From 033ffc9c1438e629599affc547e410a388f19236 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Sat, 31 Jan 2026 13:42:00 -0800 Subject: [PATCH 01/29] save --- pgdog/src/backend/schema/mod.rs | 1 + pgdog/src/backend/schema/postgres_fdw/mod.rs | 7 + .../schema/postgres_fdw/postgres_fdw.sql | 34 ++ .../src/backend/schema/postgres_fdw/schema.rs | 137 ++++++ .../backend/schema/postgres_fdw/statement.rs | 456 ++++++++++++++++++ 5 files changed, 635 insertions(+) create mode 100644 pgdog/src/backend/schema/postgres_fdw/mod.rs create mode 100644 pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql create mode 100644 pgdog/src/backend/schema/postgres_fdw/schema.rs create mode 100644 pgdog/src/backend/schema/postgres_fdw/statement.rs diff --git a/pgdog/src/backend/schema/mod.rs b/pgdog/src/backend/schema/mod.rs index 009bb63e..bae06068 100644 --- a/pgdog/src/backend/schema/mod.rs +++ b/pgdog/src/backend/schema/mod.rs @@ -1,5 +1,6 @@ //! Schema operations. pub mod columns; +pub mod postgres_fdw; pub mod relation; pub mod sync; diff --git a/pgdog/src/backend/schema/postgres_fdw/mod.rs b/pgdog/src/backend/schema/postgres_fdw/mod.rs new file mode 100644 index 00000000..3557181a --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/mod.rs @@ -0,0 +1,7 @@ +//! Schema information for creating foreign tables via postgres_fdw. + +mod schema; +mod statement; + +pub use schema::{ForeignTableColumn, FOREIGN_TABLE_SCHEMA}; +pub use statement::{create_foreign_table, ForeignTableBuilder, PartitionStrategy}; diff --git a/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql b/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql new file mode 100644 index 00000000..a4555468 --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql @@ -0,0 +1,34 @@ +SELECT + n.nspname::text AS schema_name, + c.relname::text AS table_name, + a.attname::text AS column_name, + pg_catalog.format_type(a.atttypid, a.atttypmod)::text AS column_type, + a.attnotnull::text AS is_not_null, + pg_catalog.pg_get_expr(ad.adbin, ad.adrelid)::text AS column_default, + a.attgenerated::text AS generated, + coll.collname::text AS collation_name, + collnsp.nspname::text AS collation_schema +FROM pg_catalog.pg_class c +JOIN pg_catalog.pg_namespace n ON + c.relnamespace = n.oid +LEFT JOIN pg_catalog.pg_attribute a ON + a.attrelid = c.oid + AND a.attnum > 0 + AND NOT a.attisdropped +LEFT JOIN pg_catalog.pg_attrdef ad ON + ad.adrelid = c.oid + AND ad.adnum = a.attnum +LEFT JOIN pg_catalog.pg_collation coll ON + coll.oid = a.attcollation +LEFT JOIN pg_catalog.pg_namespace collnsp ON + collnsp.oid = coll.collnamespace +WHERE + c.relkind IN ('r', 'v', 'f', 'm', 'p') + AND n.nspname <> 'pg_catalog' + AND n.nspname !~ '^pg_toast' + AND n.nspname <> 'information_schema' + AND NOT c.relispartition +ORDER BY + n.nspname, + c.relname, + a.attnum diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs new file mode 100644 index 00000000..5016e79d --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs @@ -0,0 +1,137 @@ +//! Foreign table schema query and data structures. + +use std::collections::HashMap; + +use crate::{backend::Server, net::messages::DataRow}; + +/// Query to fetch table and column information needed for CREATE FOREIGN TABLE statements. +pub static FOREIGN_TABLE_SCHEMA: &str = include_str!("postgres_fdw.sql"); + +/// Row from the foreign table schema query. +/// Each row represents a single column in a table, or a table with no columns. 
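+///
+/// For example, a hypothetical table `public.users (id bigint NOT NULL, email text)`
+/// would produce two rows: one with `column_name = "id"`, `column_type = "bigint"`,
+/// `is_not_null = true`, and one with `column_name = "email"`, `column_type = "text"`,
+/// `is_not_null = false`.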
+#[derive(Debug, Clone)]
+pub struct ForeignTableColumn {
+    pub schema_name: String,
+    pub table_name: String,
+    /// Empty if the table has no columns.
+    pub column_name: String,
+    /// Column type with modifiers, e.g. "character varying(255)".
+    pub column_type: String,
+    pub is_not_null: bool,
+    /// Default expression, also used for generated column expressions.
+    pub column_default: String,
+    /// 's' for stored generated column, empty otherwise.
+    pub generated: String,
+    pub collation_name: String,
+    pub collation_schema: String,
+}
+
+impl From<DataRow> for ForeignTableColumn {
+    fn from(value: DataRow) -> Self {
+        Self {
+            schema_name: value.get_text(0).unwrap_or_default(),
+            table_name: value.get_text(1).unwrap_or_default(),
+            column_name: value.get_text(2).unwrap_or_default(),
+            column_type: value.get_text(3).unwrap_or_default(),
+            is_not_null: value.get_text(4).unwrap_or_default() == "true",
+            column_default: value.get_text(5).unwrap_or_default(),
+            generated: value.get_text(6).unwrap_or_default(),
+            collation_name: value.get_text(7).unwrap_or_default(),
+            collation_schema: value.get_text(8).unwrap_or_default(),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct ForeignTableSchema {
+    tables: HashMap<(String, String), Vec<ForeignTableColumn>>,
+}
+
+impl ForeignTableSchema {
+    pub(crate) async fn load(server: &mut Server) -> Result<Self, crate::backend::Error> {
+        Ok(Self {
+            tables: ForeignTableColumn::load(server).await?,
+        })
+    }
+}
+
+impl ForeignTableColumn {
+    /// Check if this column is a stored generated column.
+    pub(super) fn is_generated(&self) -> bool {
+        self.generated == "s"
+    }
+
+    /// Check if this column has a collation.
+    pub(super) fn has_collation(&self) -> bool {
+        !self.collation_name.is_empty() && !self.collation_schema.is_empty()
+    }
+
+    /// Fetch columns and organize by schema and table name.
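+    ///
+    /// The result is keyed by `(schema_name, table_name)`; each value holds that
+    /// table's columns in `attnum` order (the query's ORDER BY), so, for example,
+    /// all columns of a hypothetical `public.users` land in a single `Vec`.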
+    async fn load(
+        server: &mut Server,
+    ) -> Result<HashMap<(String, String), Vec<ForeignTableColumn>>, crate::backend::Error> {
+        let mut result = HashMap::new();
+        let rows: Vec<ForeignTableColumn> = server.fetch_all(FOREIGN_TABLE_SCHEMA).await?;
+
+        for row in rows {
+            let entry = result
+                .entry((row.schema_name.clone(), row.table_name.clone()))
+                .or_insert_with(Vec::new);
+            entry.push(row);
+        }
+
+        Ok(result)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::backend::server::test::test_server;
+
+    #[tokio::test]
+    async fn test_load_foreign_table_schema() {
+        let mut server = test_server().await;
+
+        server
+            .execute("DROP TABLE IF EXISTS test_fdw_schema")
+            .await
+            .unwrap();
+
+        server
+            .execute(
+                "CREATE TABLE test_fdw_schema (
+                    id BIGINT NOT NULL,
+                    name VARCHAR(100) DEFAULT 'unknown',
+                    score NUMERIC(10, 2),
+                    created_at TIMESTAMP NOT NULL DEFAULT now()
+                )",
+            )
+            .await
+            .unwrap();
+
+        let schema = ForeignTableSchema::load(&mut server).await.unwrap();
+        let rows: Vec<_> = schema.tables.into_values().flatten().collect();
+
+        assert!(!rows.is_empty());
+
+        let test_rows: Vec<_> = rows
+            .iter()
+            .filter(|r| r.table_name == "test_fdw_schema")
+            .collect();
+        assert_eq!(test_rows.len(), 4);
+
+        let id_col = test_rows.iter().find(|r| r.column_name == "id").unwrap();
+        assert!(id_col.is_not_null);
+        assert!(id_col.column_default.is_empty());
+
+        let name_col = test_rows.iter().find(|r| r.column_name == "name").unwrap();
+        assert!(!name_col.is_not_null);
+        assert!(!name_col.column_default.is_empty());
+
+        server
+            .execute("DROP TABLE IF EXISTS test_fdw_schema")
+            .await
+            .unwrap();
+    }
+}
diff --git a/pgdog/src/backend/schema/postgres_fdw/statement.rs b/pgdog/src/backend/schema/postgres_fdw/statement.rs
new file mode 100644
index 00000000..bc2dc1d5
--- /dev/null
+++ b/pgdog/src/backend/schema/postgres_fdw/statement.rs
@@ -0,0 +1,456 @@
+//! CREATE FOREIGN TABLE statement generation.
+
+use std::fmt::Write;
+
+use crate::backend::replication::ShardedTables;
+use crate::config::ShardedTable;
+use crate::frontend::router::sharding::Mapping;
+
+use super::ForeignTableColumn;
+
+/// Partition strategy for a sharded table.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PartitionStrategy {
+    Hash,
+    List,
+    Range,
+}
+
+impl PartitionStrategy {
+    /// Determine partition strategy from sharded table config.
+    fn from_sharded_table(table: &ShardedTable) -> Self {
+        match &table.mapping {
+            Some(Mapping::List(_)) => Self::List,
+            Some(Mapping::Range(_)) => Self::Range,
+            None => Self::Hash,
+        }
+    }
+
+    /// SQL keyword for this partition strategy.
+    fn as_sql(&self) -> &'static str {
+        match self {
+            Self::Hash => "HASH",
+            Self::List => "LIST",
+            Self::Range => "RANGE",
+        }
+    }
+}
+
+/// Quote an identifier if needed (simple Postgres-style quoting).
+fn quote_identifier(name: &str) -> String {
+    let needs_quoting = name.is_empty()
+        || !name.starts_with(|c: char| c.is_ascii_lowercase() || c == '_')
+        || name.starts_with('_') && name.chars().nth(1).is_some_and(|c| c.is_ascii_digit())
+        || !name
+            .chars()
+            .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_');
+
+    if needs_quoting {
+        format!("\"{}\"", name.replace('"', "\"\""))
+    } else {
+        name.to_string()
+    }
+}
+
+/// Escape a string literal for use in SQL.
+fn escape_literal(s: &str) -> String {
+    format!("'{}'", s.replace('\'', "''"))
+}
+
+/// Builder for CREATE FOREIGN TABLE statements.
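+///
+/// A minimal usage sketch (assuming `columns` holds one table's rows loaded by
+/// `ForeignTableSchema` and `sharded_tables` comes from the cluster config):
+///
+/// ```ignore
+/// let sql = ForeignTableBuilder::new(&columns, "remote_server", &sharded_tables)
+///     .build()
+///     .expect("table has at least one column");
+/// ```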
+pub struct ForeignTableBuilder<'a> {
+    columns: &'a [ForeignTableColumn],
+    server_name: &'a str,
+    sharded_tables: &'a ShardedTables,
+}
+
+impl<'a> ForeignTableBuilder<'a> {
+    /// Create a new builder with required parameters.
+    pub fn new(
+        columns: &'a [ForeignTableColumn],
+        server_name: &'a str,
+        sharded_tables: &'a ShardedTables,
+    ) -> Self {
+        Self {
+            columns,
+            server_name,
+            sharded_tables,
+        }
+    }
+
+    /// Find the sharding configuration for this table, if any.
+    fn find_sharded_config(&self) -> Option<&ShardedTable> {
+        let first = self.columns.first()?;
+        let table_name = &first.table_name;
+        let schema_name = &first.schema_name;
+
+        for candidate in self.sharded_tables.tables() {
+            // Match table name if specified
+            if let Some(ref name) = candidate.name {
+                if name != table_name {
+                    continue;
+                }
+            }
+
+            // Match schema if specified
+            if let Some(ref schema) = candidate.schema {
+                if schema != schema_name {
+                    continue;
+                }
+            }
+
+            // Check if the shard column exists in this table
+            let has_column = self
+                .columns
+                .iter()
+                .any(|col| col.column_name == candidate.column);
+
+            if has_column {
+                return Some(candidate);
+            }
+        }
+
+        None
+    }
+
+    /// Build the CREATE FOREIGN TABLE statement.
+    pub fn build(self) -> Option<String> {
+        let first = self.columns.first()?;
+        let schema_name = &first.schema_name;
+        let table_name = &first.table_name;
+
+        let mut sql = String::new();
+        writeln!(
+            sql,
+            "CREATE FOREIGN TABLE {} (",
+            quote_identifier(table_name)
+        )
+        .ok()?;
+
+        let mut first_col = true;
+        for col in self.columns {
+            if col.column_name.is_empty() {
+                continue;
+            }
+
+            if first_col {
+                first_col = false;
+            } else {
+                sql.push_str(",\n");
+            }
+
+            write!(
+                sql,
+                "    {} {}",
+                quote_identifier(&col.column_name),
+                col.column_type
+            )
+            .ok()?;
+
+            write!(
+                sql,
+                " OPTIONS (column_name {})",
+                escape_literal(&col.column_name)
+            )
+            .ok()?;
+
+            if col.has_collation() {
+                write!(
+                    sql,
+                    " COLLATE {}.{}",
+                    quote_identifier(&col.collation_schema),
+                    quote_identifier(&col.collation_name)
+                )
+                .ok()?;
+            }
+
+            if !col.column_default.is_empty() {
+                if col.is_generated() {
+                    write!(sql, " GENERATED ALWAYS AS ({}) STORED", col.column_default).ok()?;
+                } else {
+                    write!(sql, " DEFAULT {}", col.column_default).ok()?;
+                }
+            }
+
+            if col.is_not_null {
+                sql.push_str(" NOT NULL");
+            }
+        }
+
+        sql.push('\n');
+        sql.push(')');
+
+        // Add PARTITION BY clause if table is sharded
+        if let Some(sharded) = self.find_sharded_config() {
+            let strategy = PartitionStrategy::from_sharded_table(sharded);
+            write!(
+                sql,
+                " PARTITION BY {} ({})",
+                strategy.as_sql(),
+                quote_identifier(&sharded.column)
+            )
+            .ok()?;
+        }
+
+        // Add SERVER and OPTIONS
+        write!(
+            sql,
+            "\nSERVER {}\nOPTIONS (schema_name {}, table_name {})",
+            quote_identifier(self.server_name),
+            escape_literal(schema_name),
+            escape_literal(table_name)
+        )
+        .ok()?;
+
+        Some(sql)
+    }
+}
+
+/// Generate a CREATE FOREIGN TABLE statement from column definitions.
+///
+/// All columns must belong to the same table. The server_name is the foreign server
+/// to reference in the statement. If the table is found in sharded_tables configuration,
+/// adds the appropriate PARTITION BY clause (HASH, LIST, or RANGE).
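+///
+/// A sketch of the generated SQL for a hash-sharded table, mirroring the tests
+/// below (the per-column OPTIONS carry the remote column name):
+///
+/// ```text
+/// CREATE FOREIGN TABLE test_table (
+///     id bigint OPTIONS (column_name 'id') NOT NULL,
+///     name text OPTIONS (column_name 'name')
+/// ) PARTITION BY HASH (id)
+/// SERVER remote_server
+/// OPTIONS (schema_name 'public', table_name 'test_table')
+/// ```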
+pub fn create_foreign_table(
+    columns: &[ForeignTableColumn],
+    server_name: &str,
+    sharded_tables: &ShardedTables,
+) -> Option<String> {
+    ForeignTableBuilder::new(columns, server_name, sharded_tables).build()
+}
+
+#[cfg(test)]
+mod test {
+    use std::collections::HashSet;
+
+    use super::*;
+    use crate::config::{DataType, FlexibleType, ShardedMapping, ShardedMappingKind};
+
+    fn test_column(name: &str, col_type: &str) -> ForeignTableColumn {
+        ForeignTableColumn {
+            schema_name: "public".into(),
+            table_name: "test_table".into(),
+            column_name: name.into(),
+            column_type: col_type.into(),
+            is_not_null: false,
+            column_default: String::new(),
+            generated: String::new(),
+            collation_name: String::new(),
+            collation_schema: String::new(),
+        }
+    }
+
+    fn test_sharded_table(table: &str, column: &str) -> ShardedTable {
+        ShardedTable {
+            database: "test".into(),
+            name: Some(table.into()),
+            schema: Some("public".into()),
+            column: column.into(),
+            primary: false,
+            centroids: vec![],
+            centroids_path: None,
+            data_type: DataType::Bigint,
+            centroid_probes: 0,
+            hasher: Default::default(),
+            mapping: None,
+        }
+    }
+
+    fn test_sharded_table_with_mapping(
+        table: &str,
+        column: &str,
+        mapping: Mapping,
+    ) -> ShardedTable {
+        ShardedTable {
+            mapping: Some(mapping),
+            ..test_sharded_table(table, column)
+        }
+    }
+
+    fn list_mapping() -> Mapping {
+        let mapping = ShardedMapping {
+            database: "test".into(),
+            column: "region".into(),
+            table: Some("test_table".into()),
+            schema: Some("public".into()),
+            kind: ShardedMappingKind::List,
+            start: None,
+            end: None,
+            values: HashSet::from([FlexibleType::String("us".into())]),
+            shard: 0,
+        };
+        Mapping::new(&[mapping]).unwrap()
+    }
+
+    fn range_mapping() -> Mapping {
+        let mapping = ShardedMapping {
+            database: "test".into(),
+            column: "id".into(),
+            table: Some("test_table".into()),
+            schema: Some("public".into()),
+            kind: ShardedMappingKind::Range,
+            start: Some(FlexibleType::Integer(0)),
+            end: Some(FlexibleType::Integer(1000)),
+            values: HashSet::new(),
+            shard: 0,
+        };
+        Mapping::new(&[mapping]).unwrap()
+    }
+
+    #[test]
+    fn test_create_foreign_table_basic() {
+        let columns = vec![
+            ForeignTableColumn {
+                is_not_null: true,
+                ..test_column("id", "bigint")
+            },
+            ForeignTableColumn {
+                column_default: "'unknown'::character varying".into(),
+                ..test_column("name", "character varying(100)")
+            },
+        ];
+
+        let sharded_tables = ShardedTables::default();
+        let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap();
+
+        assert!(sql.contains("CREATE FOREIGN TABLE"));
+        assert!(sql.contains("test_table"));
+        assert!(sql.contains("bigint"));
+        assert!(sql.contains("NOT NULL"));
+        assert!(sql.contains("OPTIONS (column_name 'id')"));
+        assert!(sql.contains("character varying(100)"));
+        assert!(sql.contains("DEFAULT 'unknown'::character varying"));
+        assert!(sql.contains("SERVER"));
+        assert!(sql.contains("remote_server"));
+        assert!(sql.contains("schema_name 'public'"));
+        assert!(!sql.contains("PARTITION BY"));
+    }
+
+    #[test]
+    fn test_create_foreign_table_with_hash_sharding() {
+        let columns = vec![
+            ForeignTableColumn {
+                is_not_null: true,
+                ..test_column("id", "bigint")
+            },
+            test_column("name", "text"),
+        ];
+
+        let sharded_tables: ShardedTables =
+            [test_sharded_table("test_table", "id")].as_slice().into();
+
+        let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap();
+
+        assert!(sql.contains("CREATE FOREIGN TABLE"));
+        assert!(sql.contains("PARTITION BY HASH (id)"));
+        assert!(sql.contains("SERVER remote_server"));
remote_server")); + } + + #[test] + fn test_create_foreign_table_with_list_sharding() { + let columns = vec![test_column("id", "bigint"), test_column("region", "text")]; + + let sharded_tables: ShardedTables = [test_sharded_table_with_mapping( + "test_table", + "region", + list_mapping(), + )] + .as_slice() + .into(); + + let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap(); + + assert!(sql.contains("PARTITION BY LIST (region)")); + } + + #[test] + fn test_create_foreign_table_with_range_sharding() { + let columns = vec![test_column("id", "bigint"), test_column("name", "text")]; + + let sharded_tables: ShardedTables = [test_sharded_table_with_mapping( + "test_table", + "id", + range_mapping(), + )] + .as_slice() + .into(); + + let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap(); + + assert!(sql.contains("PARTITION BY RANGE (id)")); + } + + #[test] + fn test_create_foreign_table_no_shard_match() { + let columns = vec![test_column("id", "bigint"), test_column("name", "text")]; + + // Sharded table config for different table + let sharded_tables: ShardedTables = [test_sharded_table("other_table", "user_id")] + .as_slice() + .into(); + + let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap(); + + assert!(!sql.contains("PARTITION BY")); + } + + #[test] + fn test_create_foreign_table_column_mismatch() { + let columns = vec![test_column("id", "bigint"), test_column("name", "text")]; + + // Sharded table matches by name but column doesn't exist + let sharded_tables: ShardedTables = [test_sharded_table("test_table", "user_id")] + .as_slice() + .into(); + + let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap(); + + assert!(!sql.contains("PARTITION BY")); + } + + #[test] + fn test_create_foreign_table_with_generated() { + let columns = vec![ForeignTableColumn { + column_default: "(price * quantity)".into(), + generated: "s".into(), + ..test_column("total", "numeric") + }]; + + let sharded_tables = ShardedTables::default(); + let sql = create_foreign_table(&columns, "srv", &sharded_tables).unwrap(); + + assert!(sql.contains("GENERATED ALWAYS AS ((price * quantity)) STORED")); + assert!(!sql.contains("DEFAULT")); + } + + #[test] + fn test_create_foreign_table_with_collation() { + let columns = vec![ForeignTableColumn { + collation_name: "en_US".into(), + collation_schema: "pg_catalog".into(), + ..test_column("title", "text") + }]; + + let sharded_tables = ShardedTables::default(); + let sql = create_foreign_table(&columns, "srv", &sharded_tables).unwrap(); + + assert!(sql.contains("COLLATE pg_catalog.\"en_US\"")); + } + + #[test] + fn test_create_foreign_table_empty_columns() { + let sharded_tables = ShardedTables::default(); + let result = create_foreign_table(&[], "srv", &sharded_tables); + assert!(result.is_none()); + } + + #[test] + fn test_quote_identifier() { + assert_eq!(quote_identifier("users"), "users"); + assert_eq!(quote_identifier("my table"), "\"my table\""); + assert_eq!(quote_identifier("123abc"), "\"123abc\""); + assert_eq!(quote_identifier("has\"quote"), "\"has\"\"quote\""); + assert_eq!(quote_identifier("CamelCase"), "\"CamelCase\""); + assert_eq!(quote_identifier("_valid"), "_valid"); + } +} From d82eb8c17247497bad51571fad7135f23d2bccbe Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Sat, 31 Jan 2026 15:32:56 -0800 Subject: [PATCH 02/29] postgres --- Cargo.lock | 24 ++- pgdog/Cargo.toml | 1 + pgdog/src/backend/fdw/config_parser.rs | 54 +++++ 
 pgdog/src/backend/fdw/error.rs         |  19 ++
 pgdog/src/backend/fdw/mod.rs           |   6 +
 pgdog/src/backend/fdw/postgres.rs      | 261 +++++++++++++++++++++++++
 pgdog/src/backend/mod.rs               |   1 +
 7 files changed, 360 insertions(+), 6 deletions(-)
 create mode 100644 pgdog/src/backend/fdw/config_parser.rs
 create mode 100644 pgdog/src/backend/fdw/error.rs
 create mode 100644 pgdog/src/backend/fdw/mod.rs
 create mode 100644 pgdog/src/backend/fdw/postgres.rs

diff --git a/Cargo.lock b/Cargo.lock
index e307307b..ed2fab0c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1824,9 +1824,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"

 [[package]]
 name = "libc"
-version = "0.2.172"
+version = "0.2.180"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
+checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"

 [[package]]
 name = "libloading"
@@ -1940,7 +1940,7 @@ version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c0aeb26bf5e836cc1c341c8106051b573f1766dfa05aa87f0b98be5e51b02303"
 dependencies = [
- "nix",
+ "nix 0.29.0",
  "winapi",
 ]

@@ -2078,6 +2078,18 @@ dependencies = [
  "memoffset",
 ]

+[[package]]
+name = "nix"
+version = "0.31.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "225e7cfe711e0ba79a68baeddb2982723e4235247aefce1482f2f16c27865b66"
+dependencies = [
+ "bitflags 2.9.1",
+ "cfg-if",
+ "cfg_aliases",
+ "libc",
+]
+
 [[package]]
 name = "nom"
 version = "7.1.3"
@@ -2421,6 +2433,7 @@ dependencies = [
  "lazy_static",
  "lru 0.16.0",
  "md5",
+ "nix 0.31.1",
  "once_cell",
  "parking_lot",
  "pg_query",
@@ -2512,7 +2525,7 @@ dependencies = [
  "pgdog-vector",
  "rust_decimal",
  "serde",
- "thiserror 1.0.69",
+ "thiserror 2.0.12",
  "uuid",
 ]

@@ -3879,7 +3892,6 @@ dependencies = [
  "cfg-if",
  "libc",
  "psm",
- "windows-sys 0.52.0",
  "windows-sys 0.59.0",
 ]

@@ -4067,7 +4079,7 @@ dependencies = [
  "libc",
  "log",
  "memmem",
- "nix",
+ "nix 0.29.0",
  "num-derive",
  "num-traits",
  "ordered-float",
diff --git a/pgdog/Cargo.toml b/pgdog/Cargo.toml
index 9464bb7e..15903da3 100644
--- a/pgdog/Cargo.toml
+++ b/pgdog/Cargo.toml
@@ -63,6 +63,7 @@ hickory-resolver = "0.25.2"
 lazy_static = "1"
 dashmap = "6"
 derive_builder = "0.20.2"
+nix = { version = "0.31", features = ["signal"] }
 pgdog-config = { path = "../pgdog-config" }
 pgdog-vector = { path = "../pgdog-vector" }
 pgdog-stats = { path = "../pgdog-stats" }
diff --git a/pgdog/src/backend/fdw/config_parser.rs b/pgdog/src/backend/fdw/config_parser.rs
new file mode 100644
index 00000000..742494ed
--- /dev/null
+++ b/pgdog/src/backend/fdw/config_parser.rs
@@ -0,0 +1,54 @@
+use std::path::{Path, PathBuf};
+use tokio::{
+    fs::{read_to_string, File},
+    io::AsyncWriteExt,
+};
+
+use super::Error;
+
+#[derive(Debug, Clone)]
+pub(crate) struct ConfigParser {
+    path: PathBuf,
+    content: String,
+}
+
+impl ConfigParser {
+    /// Load configuration from path.
+    pub(crate) async fn load(path: impl AsRef<Path>) -> Result<Self, Error> {
+        let path = PathBuf::from(path.as_ref());
+        // let content = read_to_string(&path).await?;
+
+        Ok(Self {
+            path,
+            content: String::new(),
+        })
+    }
+
+    /// Add a setting
+    pub(crate) fn set(&mut self, name: &str, value: &str) {
+        self.content.push_str(&format!("{} = {}\n", name, value));
+    }
+
+    /// Save configuration.
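+    ///
+    /// Note: `load` starts from an empty buffer (the original file is never read
+    /// back), so saving replaces postgresql.conf with only the settings added via
+    /// `set`; everything else falls back to Postgres defaults.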
+    pub(crate) async fn save(&self) -> Result<(), Error> {
+        let mut file = File::create(&self.path).await?;
+        file.write_all(self.content.as_bytes()).await?;
+        Ok(())
+    }
+
+    /// Configure default settings we need off/on.
+    pub(crate) async fn configure_and_save(&mut self, port: u16) -> Result<(), Error> {
+        self.set("max_logical_replication_workers", "0");
+        self.set("max_sync_workers_per_subscription", "0");
+        self.set("max_parallel_apply_workers_per_subscription", "0");
+        self.set("port", &port.to_string());
+        self.set("max_connections", "100");
+        self.set("log_line_prefix", "''");
+
+        self.save().await?;
+
+        println!("{}", read_to_string(&self.path).await.unwrap());
+
+        Ok(())
+    }
+}
diff --git a/pgdog/src/backend/fdw/error.rs b/pgdog/src/backend/fdw/error.rs
new file mode 100644
index 00000000..a0805a2b
--- /dev/null
+++ b/pgdog/src/backend/fdw/error.rs
@@ -0,0 +1,19 @@
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum Error {
+    #[error("io: {0}")]
+    Io(#[from] std::io::Error),
+
+    #[error("initdb failed")]
+    InitDb,
+
+    #[error("backend: {0}")]
+    Backend(#[from] crate::backend::Error),
+
+    #[error("postgres didn't launch in time")]
+    Timeout(#[from] tokio::time::error::Elapsed),
+
+    #[error("nix: {0}")]
+    Nix(#[from] nix::Error),
+}
diff --git a/pgdog/src/backend/fdw/mod.rs b/pgdog/src/backend/fdw/mod.rs
new file mode 100644
index 00000000..1c801bf9
--- /dev/null
+++ b/pgdog/src/backend/fdw/mod.rs
@@ -0,0 +1,6 @@
+pub mod config_parser;
+pub mod error;
+pub mod postgres;
+
+pub(crate) use config_parser::ConfigParser;
+pub use error::Error;
diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs
new file mode 100644
index 00000000..c25e4edc
--- /dev/null
+++ b/pgdog/src/backend/fdw/postgres.rs
@@ -0,0 +1,261 @@
+use std::{
+    path::{Path, PathBuf},
+    process::Stdio,
+    sync::Arc,
+    time::Duration,
+};
+
+#[cfg(unix)]
+use nix::{
+    sys::signal::{kill, Signal},
+    unistd::Pid,
+};
+
+use once_cell::sync::Lazy;
+use regex::Regex;
+use tokio::{
+    fs::remove_dir_all,
+    io::{AsyncBufReadExt, BufReader},
+    process::{Child, Command},
+    select, spawn,
+    sync::Notify,
+    time::{sleep, timeout},
+};
+use tracing::{error, info};
+
+use crate::backend::{
+    pool::{Address, Config, PoolConfig},
+    ConnectReason, Pool, Server, ServerOptions,
+};
+
+use super::{ConfigParser, Error};
+
+static LOG_PREFIX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^(LOG|WARNING|ERROR|FATAL|PANIC|DEBUG\d?|INFO|NOTICE):\s+").unwrap());
+
+struct PostgresProcessAsync {
+    child: Child,
+    initdb_dir: PathBuf,
+    notify: Arc<Notify>,
+}
+
+impl PostgresProcessAsync {
+    /// Stop Postgres and cleanup.
+    async fn stop(&mut self) -> Result<(), Error> {
+        #[cfg(unix)]
+        {
+            let pid = self.child.id().expect("child has no pid") as i32;
+            let pid = Pid::from_raw(pid);
+            kill(pid, Signal::SIGINT)?;
+        }
+
+        #[cfg(not(unix))]
+        self.child.kill().await?;
+
+        self.child.wait().await?;
+
+        // Delete data dir, it's ephemeral.
+        remove_dir_all(&self.initdb_dir).await?;
+
+        Ok(())
+    }
+}
+
+pub(crate) struct PostgresProcess {
+    postres: PathBuf,
+    initdb: PathBuf,
+    initdb_dir: PathBuf,
+    socket_dir: PathBuf,
+    notify: Arc<Notify>,
+    port: u16,
+}
+
+impl PostgresProcess {
+    pub(crate) fn new(initdb_path: impl AsRef<Path>, port: u16) -> Self {
+        let notify = Arc::new(Notify::new());
+
+        Self {
+            postres: PathBuf::from("postgres"),
+            initdb: PathBuf::from("initdb"),
+            socket_dir: PathBuf::from("/tmp"),
+            initdb_dir: initdb_path.as_ref().to_owned(),
+            notify,
+            port,
+        }
+    }
+
+    /// Setup and launch Postgres process.
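+    ///
+    /// The sequence: run `initdb` on the data directory, rewrite
+    /// `postgresql.conf` through `ConfigParser`, spawn `postgres`, then forward
+    /// its stderr into our log, stripping the severity prefix with `LOG_PREFIX`.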
+    pub(crate) async fn launch(&mut self) -> Result<(), Error> {
+        info!("[fdw] initializing \"{}\"", self.initdb_dir.display());
+
+        let process = Command::new(&self.initdb)
+            .arg("-D")
+            .arg(&self.initdb_dir)
+            .arg("--username")
+            .arg("postgres")
+            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
+            .output()
+            .await?;
+
+        if !process.status.success() {
+            error!("{}", String::from_utf8_lossy(&process.stdout));
+            error!("{}", String::from_utf8_lossy(&process.stderr));
+            return Err(Error::InitDb);
+        }
+
+        // Configure Postgres.
+        ConfigParser::load(&self.initdb_dir.join("postgresql.conf"))
+            .await?
+            .configure_and_save(self.port)
+            .await?;
+
+        let child = Command::new(&self.postres)
+            .arg("-D")
+            .arg(&self.initdb_dir)
+            .arg("-k")
+            .arg(&self.socket_dir)
+            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
+            .spawn()?;
+
+        let mut process = PostgresProcessAsync {
+            child,
+            notify: self.notify.clone(),
+            initdb_dir: self.initdb_dir.clone(),
+        };
+
+        spawn(async move {
+            info!("[fdw] postgres process running");
+
+            let reader = process
+                .child
+                .stderr
+                .take()
+                .map(|stdout| BufReader::new(stdout));
+
+            let mut reader = if let Some(reader) = reader {
+                reader
+            } else {
+                error!("[fdw] failed to start subprocess: no stderr");
+                if let Err(err) = process.stop().await {
+                    error!("[fdw] failed to abort subprocess: {}", err);
+                }
+                return;
+            };
+
+            loop {
+                let mut line = String::new();
+                select! {
+                    _ = process.notify.notified() => {
+                        if let Err(err) = process.stop().await {
+                            error!("[fdw] shutdown error: {}", err);
+                        }
+                        break;
+                    }
+
+                    _ = process.child.wait() => {
+                        error!("[fdw] postgres shut down unexpectedly");
+                    }
+
+                    res = reader.read_line(&mut line) => {
+                        if let Err(err) = res {
+                            error!("[fdw] process error: {}", err);
+                            break;
+                        }
+
+                        if !line.is_empty() {
+                            let line = LOG_PREFIX.replace(&line, "");
+                            info!("[fdw/subprocess] {}", line.trim());
+                        }
+
+                    }
+                }
+            }
+
+            process.notify.notify_one();
+        });
+
+        Ok(())
+    }
+
+    /// Create server connection.
+    pub(crate) async fn connection(&self) -> Result<Server, Error> {
+        let address = self.address();
+
+        let server =
+            Server::connect(&address, ServerOptions::default(), ConnectReason::Other).await?;
+
+        Ok(server)
+    }
+
+    fn address(&self) -> Address {
+        Address {
+            host: "127.0.0.1".into(),
+            port: 6000,
+            user: "postgres".into(),
+            database_name: "postgres".into(),
+            ..Default::default()
+        }
+    }
+
+    /// Connection pool that connects directly to the server.
+    pub(crate) fn pool(&self) -> Pool {
+        Pool::new(&PoolConfig {
+            address: self.address(),
+            config: Config {
+                inner: pgdog_stats::Config {
+                    max: 10,
+                    ..Default::default()
+                },
+            },
+        })
+    }
+
+    /// Wait until process is ready and accepting connections.
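+    ///
+    /// Retries `connection()` every 100ms and gives up with `Error::Timeout`
+    /// after 5 seconds.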
+    pub(crate) async fn wait_ready(&self) -> Result<(), Error> {
+        timeout(Duration::from_millis(5000), self.wait_ready_internal()).await?;
+
+        Ok(())
+    }
+
+    async fn wait_ready_internal(&self) {
+        while let Err(_) = self.connection().await {
+            sleep(Duration::from_millis(100)).await;
+            continue;
+        }
+    }
+
+    pub(crate) async fn stop(&self) {
+        self.notify.notify_one();
+        self.notify.notified().await;
+    }
+}
+
+#[cfg(test)]
+mod test {
+
+    use super::*;
+    use tempfile::TempDir;
+
+    #[tokio::test]
+    async fn test_postgres_process() {
+        crate::logger();
+
+        let initdb = TempDir::new().unwrap();
+
+        let mut process = PostgresProcess::new(initdb.path(), 6000);
+
+        process.launch().await.unwrap();
+        process.wait_ready().await.unwrap();
+        let mut server = process.connection().await.unwrap();
+        server.execute("SELECT 1").await.unwrap();
+        server
+            .execute("CREATE TABLE test (id BIGINT)")
+            .await
+            .unwrap();
+        server.execute("INSERT INTO test VALUES (1)").await.unwrap();
+        server.execute("CHECKPOINT").await.unwrap();
+        process.stop().await;
+    }
+}
diff --git a/pgdog/src/backend/mod.rs b/pgdog/src/backend/mod.rs
index 2bc40068..80bd6935 100644
--- a/pgdog/src/backend/mod.rs
+++ b/pgdog/src/backend/mod.rs
@@ -4,6 +4,7 @@ pub mod connect_reason;
 pub mod databases;
 pub mod disconnect_reason;
 pub mod error;
+pub mod fdw;
 pub mod maintenance_mode;
 pub mod pool;
 pub mod prepared_statements;

From 72a24857485c0f25ed99f73c309ad23c48e09da2 Mon Sep 17 00:00:00 2001
From: Lev Kokotov
Date: Sat, 31 Jan 2026 17:15:13 -0800
Subject: [PATCH 03/29] interesting

---
 pgdog/Cargo.toml                                   |  2 +-
 pgdog/src/backend/fdw/mod.rs                       |  4 +--
 pgdog/src/backend/fdw/postgres.rs                  | 30 ++++++++++---------
 .../{config_parser.rs => postgres_config.rs}       |  7 ++---
 pgdog/src/backend/schema/postgres_fdw/mod.rs       |  2 ++
 pgdog/src/backend/schema/postgres_fdw/schema.rs    | 26 ++++++++++++++--
 pgdog/src/backend/schema/postgres_fdw/statement.rs |  2 +-
 7 files changed, 49 insertions(+), 24 deletions(-)
 rename pgdog/src/backend/fdw/{config_parser.rs => postgres_config.rs} (87%)

diff --git a/pgdog/Cargo.toml b/pgdog/Cargo.toml
index 15903da3..c58cc933 100644
--- a/pgdog/Cargo.toml
+++ b/pgdog/Cargo.toml
@@ -63,6 +63,7 @@ hickory-resolver = "0.25.2"
 lazy_static = "1"
 dashmap = "6"
 derive_builder = "0.20.2"
+tempfile = "3.23.0"
 nix = { version = "0.31", features = ["signal"] }
 pgdog-config = { path = "../pgdog-config" }
 pgdog-vector = { path = "../pgdog-vector" }
@@ -77,5 +78,4 @@ tikv-jemallocator = "0.6"
 cc = "1"
 
 [dev-dependencies]
-tempfile = "3.23.0"
 stats_alloc = "0.1.10"
diff --git a/pgdog/src/backend/fdw/mod.rs b/pgdog/src/backend/fdw/mod.rs
index 1c801bf9..a4f7e082 100644
--- a/pgdog/src/backend/fdw/mod.rs
+++ b/pgdog/src/backend/fdw/mod.rs
@@ -1,6 +1,6 @@
-pub mod config_parser;
 pub mod error;
 pub mod postgres;
+pub mod postgres_config;
 
-pub(crate) use config_parser::ConfigParser;
 pub use error::Error;
+pub(crate) use postgres_config::PostgresConfig;
diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs
index c25e4edc..b009d197 100644
--- a/pgdog/src/backend/fdw/postgres.rs
+++ b/pgdog/src/backend/fdw/postgres.rs
@@ -13,6 +13,7 @@ use nix::{
 
 use once_cell::sync::Lazy;
 use regex::Regex;
+use tempfile::TempDir;
 use tokio::{
     fs::remove_dir_all,
     io::{AsyncBufReadExt, BufReader},
@@ -28,7 +29,7 @@ use crate::backend::{
     ConnectReason, Pool, Server, ServerOptions,
 };
 
-use super::{ConfigParser, Error};
+use super::{Error, PostgresConfig};
 
 static LOG_PREFIX: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^(LOG|WARNING|ERROR|FATAL|PANIC|DEBUG\d?|INFO|NOTICE):\s+").unwrap());
@@ -61,27 +62,32 @@ impl PostgresProcessAsync {
     }
 }
 
+#[derive(Debug, Clone)]
 pub(crate) struct PostgresProcess {
     postres: PathBuf,
     initdb: PathBuf,
     initdb_dir: PathBuf,
-    socket_dir: PathBuf,
     notify: Arc<Notify>,
     port: u16,
 }
 
 impl PostgresProcess {
-    pub(crate) fn new(initdb_path: impl AsRef<Path>, port: u16) -> Self {
+    pub(crate) fn new(initdb_path: Option<&Path>, port: u16) -> Result<Self, Error> {
         let notify = Arc::new(Notify::new());
 
-        Self {
+        let initdb_path = if let Some(path) = initdb_path {
+            path.to_owned()
+        } else {
+            TempDir::new()?.keep()
+        };
+
+        Ok(Self {
             postres: PathBuf::from("postgres"),
             initdb: PathBuf::from("initdb"),
-            socket_dir: PathBuf::from("/tmp"),
-            initdb_dir: initdb_path.as_ref().to_owned(),
+            initdb_dir: initdb_path,
             notify,
             port,
-        }
+        })
     }
 
     /// Setup and launch Postgres process.
@@ -105,7 +111,7 @@ impl PostgresProcess {
         }
 
         // Configure Postgres.
-        ConfigParser::load(&self.initdb_dir.join("postgresql.conf"))
+        PostgresConfig::new(&self.initdb_dir.join("postgresql.conf"))
             .await?
             .configure_and_save(self.port)
             .await?;
@@ -114,7 +120,7 @@ impl PostgresProcess {
             .arg("-D")
             .arg(&self.initdb_dir)
             .arg("-k")
-            .arg(&self.socket_dir)
+            .arg(&self.initdb_dir)
             .stdout(Stdio::piped())
             .stderr(Stdio::piped())
             .spawn()?;
@@ -168,7 +174,6 @@ impl PostgresProcess {
                         if !line.is_empty() {
                             let line = LOG_PREFIX.replace(&line, "");
                             info!("[fdw/subprocess] {}", line.trim());
                         }
-
                     }
                 }
             }
@@ -236,15 +241,12 @@ impl PostgresProcess {
 mod test {
 
     use super::*;
-    use tempfile::TempDir;
 
     #[tokio::test]
     async fn test_postgres_process() {
         crate::logger();
 
-        let initdb = TempDir::new().unwrap();
-
-        let mut process = PostgresProcess::new(initdb.path(), 6000);
+        let mut process = PostgresProcess::new(None, 6000).unwrap();
 
         process.launch().await.unwrap();
         process.wait_ready().await.unwrap();
diff --git a/pgdog/src/backend/fdw/config_parser.rs b/pgdog/src/backend/fdw/postgres_config.rs
similarity index 87%
rename from pgdog/src/backend/fdw/config_parser.rs
rename to pgdog/src/backend/fdw/postgres_config.rs
index 742494ed..ffdaa514 100644
--- a/pgdog/src/backend/fdw/config_parser.rs
+++ b/pgdog/src/backend/fdw/postgres_config.rs
@@ -7,16 +7,15 @@ use tokio::{
 use super::Error;
 
 #[derive(Debug, Clone)]
-pub(crate) struct ConfigParser {
+pub(crate) struct PostgresConfig {
     path: PathBuf,
     content: String,
 }
 
-impl ConfigParser {
+impl PostgresConfig {
     /// Load configuration from path.
-    pub(crate) async fn load(path: impl AsRef<Path>) -> Result<Self, Error> {
+    pub(crate) async fn new(path: impl AsRef<Path>) -> Result<Self, Error> {
         let path = PathBuf::from(path.as_ref());
-        // let content = read_to_string(&path).await?;
 
         Ok(Self {
             path,
diff --git a/pgdog/src/backend/schema/postgres_fdw/mod.rs b/pgdog/src/backend/schema/postgres_fdw/mod.rs
index 3557181a..d865016a 100644
--- a/pgdog/src/backend/schema/postgres_fdw/mod.rs
+++ b/pgdog/src/backend/schema/postgres_fdw/mod.rs
@@ -5,3 +5,5 @@ mod statement;
 
 pub use schema::{ForeignTableColumn, FOREIGN_TABLE_SCHEMA};
 pub use statement::{create_foreign_table, ForeignTableBuilder, PartitionStrategy};
+
+use statement::quote_identifier;
diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs
index 5016e79d..71e5d619 100644
--- a/pgdog/src/backend/schema/postgres_fdw/schema.rs
+++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs
@@ -1,8 +1,11 @@
 //! Foreign table schema query and data structures.
 
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 
-use crate::{backend::Server, net::messages::DataRow};
+use crate::{
+    backend::{schema::postgres_fdw::create_foreign_table, Server},
+    net::messages::DataRow,
+};
 
 /// Query to fetch table and column information needed for CREATE FOREIGN TABLE statements.
 pub static FOREIGN_TABLE_SCHEMA: &str = include_str!("postgres_fdw.sql");
@@ -53,6 +56,25 @@ impl ForeignTableSchema {
             tables: ForeignTableColumn::load(server).await?,
         })
     }
+
+    pub(crate) async fn setup(&self, server: &mut Server) -> Result<(), super::super::Error> {
+        let mut schemas = HashSet::new();
+
+        for ((schema, table), columns) in &self.tables {
+            if !schemas.contains(schema) {
+                server
+                    .execute(&format!(
+                        "CREATE SCHEMA IF NOT EXISTS {}",
+                        super::quote_identifier(&schema)
+                    ))
+                    .await?;
+                schemas.insert(schema.clone());
+            }
+
+            // let table = create_foreign_table(&columns, server_name, sharded_tables);
+        }
+        Ok(())
+    }
 }
 
 impl ForeignTableColumn {
diff --git a/pgdog/src/backend/schema/postgres_fdw/statement.rs b/pgdog/src/backend/schema/postgres_fdw/statement.rs
index bc2dc1d5..aeaaf5da 100644
--- a/pgdog/src/backend/schema/postgres_fdw/statement.rs
+++ b/pgdog/src/backend/schema/postgres_fdw/statement.rs
@@ -37,7 +37,7 @@ impl PartitionStrategy {
 }
 
 /// Quote an identifier if needed (simple Postgres-style quoting).
-fn quote_identifier(name: &str) -> String {
+pub(super) fn quote_identifier(name: &str) -> String {
     let needs_quoting = name.is_empty()
         || !name.starts_with(|c: char| c.is_ascii_lowercase() || c == '_')
         || name.starts_with('_') && name.chars().nth(1).is_some_and(|c| c.is_ascii_digit())

From a14fdf5ab8dcd60a715419db9556986a741ce2cb Mon Sep 17 00:00:00 2001
From: Lev Kokotov
Date: Sun, 1 Feb 2026 10:53:20 -0800
Subject: [PATCH 04/29] save

---
 pgdog/src/backend/fdw/error.rs               |   3 +
 pgdog/src/backend/fdw/postgres.rs            | 120 ++++++++++++++----
 pgdog/src/backend/fdw/postgres_config.rs     |   3 +
 pgdog/src/backend/pool/cluster.rs            |  13 ++
 pgdog/src/backend/pool/lb/mod.rs             |   2 +-
 pgdog/src/backend/schema/postgres_fdw/mod.rs |   2 +-
 .../backend/schema/postgres_fdw/statement.rs |   5 +-
 7 files changed, 119 insertions(+), 29 deletions(-)

diff --git a/pgdog/src/backend/fdw/error.rs b/pgdog/src/backend/fdw/error.rs
index a0805a2b..4a5e659c 100644
--- a/pgdog/src/backend/fdw/error.rs
+++ b/pgdog/src/backend/fdw/error.rs
@@ -11,6 +11,9 @@ pub enum Error {
     #[error("backend: {0}")]
     Backend(#[from] crate::backend::Error),
 
+    #[error("pool: {0}")]
+    Pool(#[from] crate::backend::pool::Error),
+
     #[error("postgres didn't launch in time")]
     Timeout(#[from] tokio::time::error::Elapsed),
 
diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs
index b009d197..1db46451 100644
--- a/pgdog/src/backend/fdw/postgres.rs
+++ b/pgdog/src/backend/fdw/postgres.rs
@@ -1,4 +1,5 @@
 use std::{
+    collections::HashSet,
     path::{Path, PathBuf},
     process::Stdio,
     sync::Arc,
@@ -12,6 +13,7 @@ use nix::{
 };
 
 use once_cell::sync::Lazy;
+use pgdog_config::Role;
 use regex::Regex;
 use tempfile::TempDir;
 use tokio::{
@@ -22,11 +24,12 @@ use tokio::{
     sync::Notify,
     time::{sleep, timeout},
 };
-use tracing::{error, info};
+use tracing::{error, info, warn};
 
 use crate::backend::{
-    pool::{Address, Config, PoolConfig},
-    ConnectReason, Pool, Server, ServerOptions,
+    pool::{Address, Request},
+    schema::postgres_fdw::ForeignTableSchema,
+    Cluster, ConnectReason, Server, ServerOptions,
 };
 
 use super::{Error, PostgresConfig};
@@ -69,6 +72,8 @@ pub(crate) struct PostgresProcess {
     initdb_dir: PathBuf,
     notify: Arc<Notify>,
     port: u16,
+    databases: HashSet<String>,
+    pid: Option<i32>,
 }
 
 impl PostgresProcess {
@@ -87,6 +92,8 @@ impl PostgresProcess {
             initdb_dir: initdb_path,
             notify,
             port,
+            databases: HashSet::new(),
+            pid: None,
         })
     }
 
@@ -125,6 +132,8 @@ impl PostgresProcess {
             .stderr(Stdio::piped())
             .spawn()?;
 
+        self.pid = child.id().map(|pid| pid as i32);
+
         let mut process = PostgresProcessAsync {
             child,
             notify: self.notify.clone(),
@@ -162,6 +171,7 @@ impl PostgresProcess {
 
                     _ = process.child.wait() => {
                         error!("[fdw] postgres shut down unexpectedly");
+                        break;
                     }
 
                     res = reader.read_line(&mut line) => {
@@ -172,7 +182,7 @@ impl PostgresProcess {
 
                         if !line.is_empty() {
                             let line = LOG_PREFIX.replace(&line, "");
-                            info!("[fdw/subprocess] {}", line.trim());
+                            info!("[fdw::subprocess] {}", line.trim());
                         }
                     }
                 }
@@ -184,9 +194,55 @@ impl PostgresProcess {
         Ok(())
     }
 
+    /// Setup the Postgres database for usage with cluster.
+    pub(crate) async fn configure(&mut self, cluster: &Cluster) -> Result<(), Error> {
+        let database = cluster.identifier().database.clone();
+        let mut connection = self.admin_connection().await?;
+
+        connection
+            .execute("CREATE EXTENSION IF NOT EXISTS postgres_fdw")
+            .await?;
+
+        if !self.databases.contains(&database) {
+            connection
+                .execute(format!(r#"CREATE DATABASE "{}""#, database))
+                .await?;
+            for (number, shard) in cluster.shards().iter().enumerate() {
+                let primary = shard
+                    .pools_with_roles()
+                    .into_iter()
+                    .find(|(role, _)| role == &Role::Primary)
+                    .map(|(_, pool)| pool.addr().clone());
+                if let Some(primary) = primary {
+                    connection
+                        .execute(&format!(
+                            r#"CREATE SERVER "shard_{}"
+                            FOREIGN DATA WRAPPER postgres_fdw
+                            OPTIONS (host '{}', port '{}', dbname '{}')"#,
+                            number, primary.host, primary.port, primary.database_name,
+                        ))
+                        .await?;
+                }
+            }
+            let schema = {
+                let mut server = cluster.primary_or_replica(0, &Request::default()).await?;
+                ForeignTableSchema::load(&mut server).await?
+            };
+            schema.setup(&mut connection).await?;
+            self.databases.insert(database);
+        }
+
+        Ok(())
+    }
+
     /// Create server connection.
-    pub(crate) async fn connection(&self) -> Result<Server, Error> {
-        let address = self.address();
+    pub(crate) async fn admin_connection(&self) -> Result<Server, Error> {
+        self.connection("postgres", "postgres").await
+    }
+
+    /// Get a connection with the user and database.
+    pub(crate) async fn connection(&self, user: &str, database: &str) -> Result<Server, Error> {
+        let address = self.address(user, database);
 
         let server =
             Server::connect(&address, ServerOptions::default(), ConnectReason::Other).await?;
@@ -194,29 +250,16 @@ impl PostgresProcess {
         Ok(server)
     }
 
-    fn address(&self) -> Address {
+    fn address(&self, user: &str, database: &str) -> Address {
         Address {
             host: "127.0.0.1".into(),
-            port: 6000,
-            user: "postgres".into(),
-            database_name: "postgres".into(),
+            port: self.port,
+            user: user.into(),
+            database_name: database.into(),
             ..Default::default()
         }
     }
 
-    /// Connection pool that connects directly to the server.
-    pub(crate) fn pool(&self) -> Pool {
-        Pool::new(&PoolConfig {
-            address: self.address(),
-            config: Config {
-                inner: pgdog_stats::Config {
-                    max: 10,
-                    ..Default::default()
-                },
-            },
-        })
-    }
-
     /// Wait until process is ready and accepting connections.
     pub(crate) async fn wait_ready(&self) -> Result<(), Error> {
         timeout(Duration::from_millis(5000), self.wait_ready_internal()).await?;
 
         Ok(())
     }
 
     async fn wait_ready_internal(&self) {
-        while let Err(_) = self.connection().await {
+        while let Err(_) = self.admin_connection().await {
             sleep(Duration::from_millis(100)).await;
             continue;
         }
     }
 
-    pub(crate) async fn stop(&self) {
+    pub(crate) async fn stop(&mut self) {
         self.notify.notify_one();
         self.notify.notified().await;
+        self.pid.take();
+    }
+}
+
+impl Drop for PostgresProcess {
+    fn drop(&mut self) {
+        if let Some(pid) = self.pid.take() {
+            warn!("[fdw] dirty shutdown initiated");
+
+            #[cfg(unix)]
+            {
+                if let Err(err) = kill(Pid::from_raw(pid), Signal::SIGKILL) {
+                    error!("[fdw] dirty shutdown failed: {}", err);
+                }
+
+                if let Err(err) = std::fs::remove_dir_all(&self.initdb_dir) {
+                    error!("[fdw] dirty shutdown clean-up error: {}", err);
+                }
+            }
+        }
     }
 }
 
 #[cfg(test)]
 mod test {
 
+    use crate::config::config;
+
     use super::*;
 
     #[tokio::test]
     async fn test_postgres_process() {
         crate::logger();
 
+        let cluster = Cluster::new_test(&config());
+        cluster.launch();
         let mut process = PostgresProcess::new(None, 6000).unwrap();
 
         process.launch().await.unwrap();
         process.wait_ready().await.unwrap();
+        process.configure(&cluster).await.unwrap();
-        let mut server = process.connection().await.unwrap();
+        let mut server = process.admin_connection().await.unwrap();
         server.execute("SELECT 1").await.unwrap();
         server
             .execute("CREATE TABLE test (id BIGINT)")
             .await
diff --git a/pgdog/src/backend/fdw/postgres_config.rs b/pgdog/src/backend/fdw/postgres_config.rs
index ffdaa514..d22132d1 100644
--- a/pgdog/src/backend/fdw/postgres_config.rs
+++ b/pgdog/src/backend/fdw/postgres_config.rs
@@ -43,6 +43,9 @@ impl PostgresConfig {
         self.set("port", &port.to_string());
         self.set("max_connections", "100");
         self.set("log_line_prefix", "''");
+        self.set("log_connections", "on");
+        self.set("log_disconnections", "on");
+        self.set("log_statement", "ddl");
 
         self.save().await?;
 
diff --git a/pgdog/src/backend/pool/cluster.rs b/pgdog/src/backend/pool/cluster.rs
index 33c0798a..b4b1a62b 100644
--- a/pgdog/src/backend/pool/cluster.rs
+++ b/pgdog/src/backend/pool/cluster.rs
@@ -312,6 +312,19 @@ impl Cluster {
         shard.replica(request).await
     }
 
+    /// Get a connection to either a primary or a replica.
+    pub async fn primary_or_replica(
+        &self,
+        shard: usize,
+        request: &Request,
+    ) -> Result<Guard, Error> {
+        self.shards
+            .get(shard)
+            .ok_or(Error::NoShard(shard))?
+            .primary_or_replica(request)
+            .await
+    }
+
     /// The two clusters have the same databases.
     pub(crate) fn can_move_conns_to(&self, other: &Cluster) -> bool {
         self.shards.len() == other.shards.len()
diff --git a/pgdog/src/backend/pool/lb/mod.rs b/pgdog/src/backend/pool/lb/mod.rs
index 8e030384..f08cee79 100644
--- a/pgdog/src/backend/pool/lb/mod.rs
+++ b/pgdog/src/backend/pool/lb/mod.rs
@@ -76,7 +76,7 @@ pub struct LoadBalancer {
     pub(super) round_robin: Arc<AtomicUsize>,
     /// Chosen load balancing strategy.
     pub(super) lb_strategy: LoadBalancingStrategy,
-    /// Maintenance. notification.
+    /// Maintenance notification.
     pub(super) maintenance: Arc<Notify>,
     /// Read/write split.
pub(super) rw_split: ReadWriteSplit, diff --git a/pgdog/src/backend/schema/postgres_fdw/mod.rs b/pgdog/src/backend/schema/postgres_fdw/mod.rs index d865016a..a68f84d9 100644 --- a/pgdog/src/backend/schema/postgres_fdw/mod.rs +++ b/pgdog/src/backend/schema/postgres_fdw/mod.rs @@ -3,7 +3,7 @@ mod schema; mod statement; -pub use schema::{ForeignTableColumn, FOREIGN_TABLE_SCHEMA}; +pub use schema::{ForeignTableColumn, ForeignTableSchema, FOREIGN_TABLE_SCHEMA}; pub use statement::{create_foreign_table, ForeignTableBuilder, PartitionStrategy}; use statement::quote_identifier; diff --git a/pgdog/src/backend/schema/postgres_fdw/statement.rs b/pgdog/src/backend/schema/postgres_fdw/statement.rs index aeaaf5da..9492c498 100644 --- a/pgdog/src/backend/schema/postgres_fdw/statement.rs +++ b/pgdog/src/backend/schema/postgres_fdw/statement.rs @@ -37,7 +37,7 @@ impl PartitionStrategy { } /// Quote an identifier if needed (simple Postgres-style quoting). -pub(super) fn quote_identifier(name: &str) -> String { +pub(crate) fn quote_identifier(name: &str) -> String { let needs_quoting = name.is_empty() || !name.starts_with(|c: char| c.is_ascii_lowercase() || c == '_') || name.starts_with('_') && name.chars().nth(1).is_some_and(|c| c.is_ascii_digit()) @@ -211,6 +211,9 @@ impl<'a> ForeignTableBuilder<'a> { /// All columns must belong to the same table. The server_name is the foreign server /// to reference in the statement. If the table is found in sharded_tables configuration, /// adds the appropriate PARTITION BY clause (HASH, LIST, or RANGE). +/// +/// TODO: handle partitioned tables by creating the partitions +/// and sharding the child tables using our partition algorithm. pub fn create_foreign_table( columns: &[ForeignTableColumn], server_name: &str, From cf7497b2c0f8ef296e9fe5f0ce60cd47bac442b6 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Sun, 1 Feb 2026 15:45:20 -0800 Subject: [PATCH 05/29] save --- pgdog-config/src/sharding.rs | 9 + pgdog/src/backend/error.rs | 3 + pgdog/src/backend/fdw/error.rs | 3 + pgdog/src/backend/fdw/postgres.rs | 183 +++++++-- pgdog/src/backend/fdw/postgres_config.rs | 10 +- pgdog/src/backend/pool/cluster.rs | 3 - .../src/backend/schema/postgres_fdw/error.rs | 13 + pgdog/src/backend/schema/postgres_fdw/mod.rs | 4 +- .../src/backend/schema/postgres_fdw/schema.rs | 25 +- .../backend/schema/postgres_fdw/statement.rs | 384 ++++++++++++------ 10 files changed, 468 insertions(+), 169 deletions(-) create mode 100644 pgdog/src/backend/schema/postgres_fdw/error.rs diff --git a/pgdog-config/src/sharding.rs b/pgdog-config/src/sharding.rs index 0d47d5d9..60eae4ac 100644 --- a/pgdog-config/src/sharding.rs +++ b/pgdog-config/src/sharding.rs @@ -313,6 +313,15 @@ impl ListShards { Ok(None) } } + + /// Get all values that map to a specific shard. 
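+    ///
+    /// For example, given a hypothetical list mapping of `'us' -> 0` and
+    /// `'eu' -> 1`, `values_for_shard(0)` returns `["us"]`.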
+    pub fn values_for_shard(&self, shard: usize) -> Vec<&FlexibleType> {
+        self.mapping
+            .iter()
+            .filter(|(_, &s)| s == shard)
+            .map(|(v, _)| v)
+            .collect()
+    }
 }
 
 #[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, Hash, Default)]
diff --git a/pgdog/src/backend/error.rs b/pgdog/src/backend/error.rs
index f2465198..2cdc0379 100644
--- a/pgdog/src/backend/error.rs
+++ b/pgdog/src/backend/error.rs
@@ -134,6 +134,9 @@ pub enum Error {
 
     #[error("unsupported aggregation {function}: {reason}")]
     UnsupportedAggregation { function: String, reason: String },
+
+    #[error("{0}")]
+    ForeignTable(#[from] crate::backend::schema::postgres_fdw::Error),
 }
 
 impl From for Error {
diff --git a/pgdog/src/backend/fdw/error.rs b/pgdog/src/backend/fdw/error.rs
index 4a5e659c..b357ace9 100644
--- a/pgdog/src/backend/fdw/error.rs
+++ b/pgdog/src/backend/fdw/error.rs
@@ -19,4 +19,7 @@ pub enum Error {
 
     #[error("nix: {0}")]
     Nix(#[from] nix::Error),
+
+    #[error("shards don't have the same number of replicas/primary")]
+    ShardsHostsMismatch,
 }
diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs
index 1db46451..1d70a317 100644
--- a/pgdog/src/backend/fdw/postgres.rs
+++ b/pgdog/src/backend/fdw/postgres.rs
@@ -1,5 +1,5 @@
 use std::{
-    collections::HashSet,
+    collections::{HashMap, HashSet},
     path::{Path, PathBuf},
     process::Stdio,
     sync::Arc,
@@ -14,6 +14,7 @@ use nix::{
 
 use once_cell::sync::Lazy;
 use pgdog_config::Role;
+use rand::random_range;
 use regex::Regex;
 use tempfile::TempDir;
 use tokio::{
@@ -29,7 +30,7 @@ use tracing::{error, info, warn};
 
 use crate::backend::{
     pool::{Address, Request},
-    schema::postgres_fdw::ForeignTableSchema,
+    schema::postgres_fdw::{quote_identifier, ForeignTableSchema},
     Cluster, ConnectReason, Server, ServerOptions,
 };
 
@@ -73,6 +74,7 @@ pub(crate) struct PostgresProcess {
     notify: Arc<Notify>,
     port: u16,
     databases: HashSet<String>,
+    users: HashSet<String>,
     pid: Option<i32>,
 }
 
@@ -94,6 +96,7 @@ impl PostgresProcess {
             notify,
             port,
             databases: HashSet::new(),
+            users: HashSet::new(),
             pid: None,
         })
     }
@@ -197,42 +200,137 @@ impl PostgresProcess {
         Ok(())
     }
 
-    /// Setup the Postgres database for usage with cluster.
-    pub(crate) async fn configure(&mut self, cluster: &Cluster) -> Result<(), Error> {
-        let database = cluster.identifier().database.clone();
-        let mut connection = self.admin_connection().await?;
-
-        connection
-            .execute("CREATE EXTENSION IF NOT EXISTS postgres_fdw")
-            .await?;
-
-        if !self.databases.contains(&database) {
-            connection
-                .execute(format!(r#"CREATE DATABASE "{}""#, database))
-                .await?;
-            for (number, shard) in cluster.shards().iter().enumerate() {
-                let primary = shard
-                    .pools_with_roles()
-                    .into_iter()
-                    .find(|(role, _)| role == &Role::Primary)
-                    .map(|(_, pool)| pool.addr().clone());
-                if let Some(primary) = primary {
-                    connection
-                        .execute(&format!(
-                            r#"CREATE SERVER "shard_{}"
-                            FOREIGN DATA WRAPPER postgres_fdw
-                            OPTIONS (host '{}', port '{}', dbname '{}')"#,
-                            number, primary.host, primary.port, primary.database_name,
-                        ))
-                        .await?;
-                }
-            }
-            let schema = {
-                let mut server = cluster.primary_or_replica(0, &Request::default()).await?;
-                ForeignTableSchema::load(&mut server).await?
-            };
-            schema.setup(&mut connection).await?;
-            self.databases.insert(database);
-        }
-
-        Ok(())
-    }
-
+    fn pools_to_databases(
+        cluster: &Cluster,
+        shard: usize,
+    ) -> Result<Vec<(String, Address)>, Error> {
+        let mut replica = 0;
+
+        let shard = cluster
+            .shards()
+            .get(shard)
+            .ok_or(Error::ShardsHostsMismatch)?;
+
+        Ok(shard
+            .pools_with_roles()
+            .iter()
+            .map(|(role, pool)| {
+                let database = match role {
+                    Role::Primary => format!("{}_p", cluster.identifier().database),
+                    _ => {
+                        replica += 1;
+                        format!("{}_r{}", cluster.identifier().database, replica)
+                    }
+                };
+                (database, pool.addr().clone())
+            })
+            .collect())
+    }
+
+    async fn setup_databases(&mut self, cluster: &Cluster) -> Result<bool, Error> {
+        let hosts: Vec<_> = cluster
+            .shards()
+            .iter()
+            .map(|shard| {
+                let mut roles: Vec<_> = shard
+                    .pools_with_roles()
+                    .iter()
+                    .map(|(role, _)| role)
+                    .cloned()
+                    .collect();
+                roles.sort();
+                roles
+            })
+            .collect();
+        let identical = hosts.windows(2).all(|w| w.get(0) == w.get(1));
+        if !identical {
+            return Err(Error::ShardsHostsMismatch);
+        }
+
+        let mut admin_connection = self.admin_connection().await?;
+        let mut created = false;
+
+        for (database, _) in Self::pools_to_databases(cluster, 0)? {
+            if !self.databases.contains(&database) {
+                admin_connection
+                    .execute(format!(
+                        r#"CREATE DATABASE {}"#,
+                        quote_identifier(&database)
+                    ))
+                    .await?;
+                created = true;
+            }
+        }
+
+        Ok(created)
+    }
+
+    /// Create the same load-balancing and sharding setup we have in pgdog.toml
+    /// for this cluster.
+    pub(crate) async fn configure(&mut self, cluster: &Cluster) -> Result<(), Error> {
+        if !self.setup_databases(cluster).await? {
+            return Ok(());
+        }
+
+        let sharding_schema = cluster.sharding_schema();
+
+        let schema = {
+            // TODO: Double check schemas are identical on all shards.
+            let shard = random_range(0..sharding_schema.shards);
+            let mut server = cluster
+                .primary_or_replica(shard, &Request::default())
+                .await?;
+            ForeignTableSchema::load(&mut server).await?
- }; - schema.setup(&mut connection).await?; - self.databases.insert(database); + } + + for database in &databases { + let mut connection = connections.get_mut(database).expect("connection is gone"); + schema.setup(&mut connection, &sharding_schema).await?; } Ok(()) @@ -289,11 +387,11 @@ impl Drop for PostgresProcess { #[cfg(unix)] { if let Err(err) = kill(Pid::from_raw(pid), Signal::SIGKILL) { - error!("[fdw] dirty shutdown failed: {}", err); + error!("[fdw] dirty shutdown error: {}", err); } if let Err(err) = std::fs::remove_dir_all(&self.initdb_dir) { - error!("[fdw] dirty shutdown clean-up error: {}", err); + error!("[fdw] dirty shutdown cleanup error: {}", err); } } } @@ -313,13 +411,26 @@ mod test { let cluster = Cluster::new_test(&config()); cluster.launch(); - let mut process = PostgresProcess::new(None, 6000).unwrap(); + let mut process = PostgresProcess::new(None, 45012).unwrap(); process.launch().await.unwrap(); process.wait_ready().await.unwrap(); process.configure(&cluster).await.unwrap(); let mut server = process.admin_connection().await.unwrap(); - server.execute("SELECT 1").await.unwrap(); + let backends = server + .fetch_all::("SELECT backend_type::text FROM pg_stat_activity ORDER BY 1") + .await + .unwrap(); + assert_eq!( + backends, + [ + "background writer", + "checkpointer", + "client backend", + "walwriter" + ] + ); + server .execute("CREATE TABLE test (id BIGINT)") .await diff --git a/pgdog/src/backend/fdw/postgres_config.rs b/pgdog/src/backend/fdw/postgres_config.rs index d22132d1..1cc309be 100644 --- a/pgdog/src/backend/fdw/postgres_config.rs +++ b/pgdog/src/backend/fdw/postgres_config.rs @@ -37,6 +37,7 @@ impl PostgresConfig { /// Configure default settings we need off/on. pub(crate) async fn configure_and_save(&mut self, port: u16) -> Result<(), Error> { + // Disable logical replication workers. self.set("max_logical_replication_workers", "0"); self.set("max_sync_workers_per_subscription", "0"); self.set("max_parallel_apply_workers_per_subscription", "0"); @@ -46,11 +47,16 @@ impl PostgresConfig { self.set("log_connections", "on"); self.set("log_disconnections", "on"); self.set("log_statement", "ddl"); + // Disable autovacuum. This is safe, this database doesn't write anything locally. + self.set("autovacuum", "off"); + // Make the background writer do nothing. + self.set("bgwriter_lru_maxpages", "0"); + self.set("bgwriter_delay", "10s"); + // Disable async io workers. + self.set("io_method", "sync"); self.save().await?; - println!("{}", read_to_string(&self.path).await.unwrap()); - Ok(()) } } diff --git a/pgdog/src/backend/pool/cluster.rs b/pgdog/src/backend/pool/cluster.rs index b4b1a62b..b58d0694 100644 --- a/pgdog/src/backend/pool/cluster.rs +++ b/pgdog/src/backend/pool/cluster.rs @@ -692,10 +692,7 @@ mod test { name: Some("sharded".into()), column: "id".into(), primary: true, - centroids: vec![], data_type: DataType::Bigint, - centroids_path: None, - centroid_probes: 1, hasher: Hasher::Postgres, ..Default::default() }], diff --git a/pgdog/src/backend/schema/postgres_fdw/error.rs b/pgdog/src/backend/schema/postgres_fdw/error.rs new file mode 100644 index 00000000..fca71e70 --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/error.rs @@ -0,0 +1,13 @@ +//! Errors for foreign table statement generation. + +use std::fmt; +use thiserror::Error; + +/// Errors that can occur when building foreign table statements. 
+#[derive(Debug, Error)] +pub enum Error { + #[error("no columns provided")] + NoColumns, + #[error("format error: {0}")] + Format(#[from] fmt::Error), +} diff --git a/pgdog/src/backend/schema/postgres_fdw/mod.rs b/pgdog/src/backend/schema/postgres_fdw/mod.rs index a68f84d9..8b25b89e 100644 --- a/pgdog/src/backend/schema/postgres_fdw/mod.rs +++ b/pgdog/src/backend/schema/postgres_fdw/mod.rs @@ -1,9 +1,11 @@ //! Schema information for creating foreign tables via postgres_fdw. +mod error; mod schema; mod statement; +pub use error::Error; pub use schema::{ForeignTableColumn, ForeignTableSchema, FOREIGN_TABLE_SCHEMA}; pub use statement::{create_foreign_table, ForeignTableBuilder, PartitionStrategy}; -use statement::quote_identifier; +pub(crate) use statement::quote_identifier; diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs index 71e5d619..761f2648 100644 --- a/pgdog/src/backend/schema/postgres_fdw/schema.rs +++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs @@ -1,9 +1,10 @@ //! Foreign table schema query and data structures. use std::collections::{HashMap, HashSet}; +use tracing::info; use crate::{ - backend::{schema::postgres_fdw::create_foreign_table, Server}, + backend::{schema::postgres_fdw::create_foreign_table, Server, ShardingSchema}, net::messages::DataRow, }; @@ -57,8 +58,13 @@ impl ForeignTableSchema { }) } - pub(crate) async fn setup(&self, server: &mut Server) -> Result<(), super::super::Error> { + pub(crate) async fn setup( + &self, + server: &mut Server, + sharding_schema: &ShardingSchema, + ) -> Result<(), super::super::Error> { let mut schemas = HashSet::new(); + let mut tables = HashSet::new(); for ((schema, table), columns) in &self.tables { if !schemas.contains(schema) { @@ -71,18 +77,21 @@ impl ForeignTableSchema { schemas.insert(schema.clone()); } - // let table = create_foreign_table(&columns, server_name, sharded_tables); + let dedup = (schema.clone(), table.clone()); + if !tables.contains(&dedup) { + let statements = create_foreign_table(columns, sharding_schema)?; + for sql in statements { + info!("[fdw::setup] {} [{}]", sql, server.addr()); + server.execute(&sql).await?; + } + tables.insert(dedup); + } } Ok(()) } } impl ForeignTableColumn { - /// Check if this column is a stored generated column. - pub(super) fn is_generated(&self) -> bool { - self.generated == "s" - } - /// Check if this column has a collation. pub(super) fn has_collation(&self) -> bool { !self.collation_name.is_empty() && !self.collation_schema.is_empty() diff --git a/pgdog/src/backend/schema/postgres_fdw/statement.rs b/pgdog/src/backend/schema/postgres_fdw/statement.rs index 9492c498..8d1b34aa 100644 --- a/pgdog/src/backend/schema/postgres_fdw/statement.rs +++ b/pgdog/src/backend/schema/postgres_fdw/statement.rs @@ -2,11 +2,23 @@ use std::fmt::Write; -use crate::backend::replication::ShardedTables; -use crate::config::ShardedTable; +use rand::Rng; + +use crate::backend::pool::ShardingSchema; +use crate::config::{FlexibleType, ShardedTable}; +use crate::frontend::router::parser::Column; use crate::frontend::router::sharding::Mapping; -use super::ForeignTableColumn; +use super::{Error, ForeignTableColumn}; + +/// Format a FlexibleType as a SQL literal. +fn flexible_type_to_sql(value: &FlexibleType) -> String { + match value { + FlexibleType::Integer(i) => i.to_string(), + FlexibleType::Uuid(u) => format!("'{}'", u), + FlexibleType::String(s) => format!("'{}'", s.replace('\'', "''")), + } +} /// Partition strategy for a sharded table. 
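The flexible_type_to_sql helper above renders sharding-key values as SQL literals. A standalone illustration of the escaping rule (the `Literal` enum stands in for FlexibleType so the sketch needs no uuid dependency):

    enum Literal {
        Integer(i64),
        String(String),
    }

    fn to_sql(value: &Literal) -> String {
        match value {
            Literal::Integer(i) => i.to_string(),
            // Double embedded single quotes, the standard SQL escape.
            Literal::String(s) => format!("'{}'", s.replace('\'', "''")),
        }
    }

    fn main() {
        assert_eq!(to_sql(&Literal::Integer(42)), "42");
        assert_eq!(to_sql(&Literal::String("O'Brien".into())), "'O''Brien'");
    }
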
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -48,7 +60,7 @@ pub(crate) fn quote_identifier(name: &str) -> String {
     if needs_quoting {
         format!("\"{}\"", name.replace('"', "\"\""))
     } else {
-        name.to_string()
+        format!(r#""{}""#, name.to_string())
     }
 }
 
@@ -57,24 +69,23 @@
 fn escape_literal(s: &str) -> String {
     format!("'{}'", s.replace('\'', "''"))
 }
 
+/// Format a fully qualified table name (schema.table).
+fn qualified_table(schema: &str, table: &str) -> String {
+    format!("{}.{}", quote_identifier(schema), quote_identifier(table))
+}
+
 /// Builder for CREATE FOREIGN TABLE statements.
 pub struct ForeignTableBuilder<'a> {
     columns: &'a [ForeignTableColumn],
-    server_name: &'a str,
-    sharded_tables: &'a ShardedTables,
+    sharding_schema: &'a ShardingSchema,
 }
 
 impl<'a> ForeignTableBuilder<'a> {
     /// Create a new builder with required parameters.
-    pub fn new(
-        columns: &'a [ForeignTableColumn],
-        server_name: &'a str,
-        sharded_tables: &'a ShardedTables,
-    ) -> Self {
+    pub fn new(columns: &'a [ForeignTableColumn], sharding_schema: &'a ShardingSchema) -> Self {
         Self {
             columns,
-            server_name,
-            sharded_tables,
+            sharding_schema,
         }
     }
 
@@ -84,49 +95,88 @@ impl<'a> ForeignTableBuilder<'a> {
         let table_name = &first.table_name;
         let schema_name = &first.schema_name;
 
-        for candidate in self.sharded_tables.tables() {
-            // Match table name if specified
-            if let Some(ref name) = candidate.name {
-                if name != table_name {
-                    continue;
-                }
+        for col in self.columns {
+            let column = Column {
+                name: &col.column_name,
+                table: Some(table_name.as_str()),
+                schema: Some(schema_name.as_str()),
+            };
+
+            if let Some(sharded) = self.sharding_schema.tables().get_table(column) {
+                return Some(sharded);
             }
+        }
 
-            // Match schema if specified
-            if let Some(ref schema) = candidate.schema {
-                if schema != schema_name {
-                    continue;
-                }
+        None
+    }
+
+    /// Build column definitions SQL fragment (shared between parent and foreign tables).
+    fn build_columns(&self) -> Result<String, Error> {
+        let mut sql = String::new();
+        let mut first_col = true;
+
+        for col in self.columns {
+            if col.column_name.is_empty() {
+                continue;
             }
 
-            // Check if the shard column exists in this table
-            let has_column = self
-                .columns
-                .iter()
-                .any(|col| col.column_name == candidate.column);
+            if first_col {
+                first_col = false;
+            } else {
+                sql.push_str(",\n");
+            }
+
+            write!(
+                sql,
+                " {} {}",
+                quote_identifier(&col.column_name),
+                col.column_type
+            )?;
+
+            if col.has_collation() {
+                write!(
+                    sql,
+                    " COLLATE {}.{}",
+                    quote_identifier(&col.collation_schema),
+                    quote_identifier(&col.collation_name)
+                )?;
+            }
 
-            if has_column {
-                return Some(candidate);
+            if col.is_not_null {
+                sql.push_str(" NOT NULL");
             }
         }
 
-        None
+        Ok(sql)
     }
 
-    /// Build the CREATE FOREIGN TABLE statement.
-    pub fn build(self) -> Option<String> {
-        let first = self.columns.first()?;
+    /// Build the CREATE TABLE / CREATE FOREIGN TABLE statement(s).
+    pub fn build(self) -> Result<Vec<String>, Error> {
+        let first = self.columns.first().ok_or(Error::NoColumns)?;
         let schema_name = &first.schema_name;
        let table_name = &first.table_name;
 
+        if let Some(sharded) = self.find_sharded_config() {
+            self.build_sharded(table_name, schema_name, sharded)
+        } else {
+            self.build_foreign_table(table_name, schema_name)
+        }
+    }
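PartitionStrategy's definition is elided by the hunks above; from how as_sql() and from_sharded_table() are used, it maps the table's configured Mapping onto Postgres partitioning keywords. A sketch under that assumption (the `Strategy` name is illustrative):

    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum Strategy {
        Hash,  // no explicit mapping configured
        List,  // Mapping::List
        Range, // Mapping::Range
    }

    impl Strategy {
        fn as_sql(&self) -> &'static str {
            match self {
                Strategy::Hash => "HASH",
                Strategy::List => "LIST",
                Strategy::Range => "RANGE",
            }
        }
    }
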
+
+    /// Build a simple foreign table (non-sharded).
+    fn build_foreign_table(
+        &self,
+        table_name: &str,
+        schema_name: &str,
+    ) -> Result<Vec<String>, Error> {
         let mut sql = String::new();
 
         writeln!(
             sql,
             "CREATE FOREIGN TABLE {} (",
-            quote_identifier(table_name)
-        )
-        .ok()?;
+            qualified_table(schema_name, table_name)
+        )?;
 
+        // Column definitions with OPTIONS for foreign tables
         let mut first_col = true;
         for col in self.columns {
             if col.column_name.is_empty() {
@@ -144,15 +194,13 @@ impl<'a> ForeignTableBuilder<'a> {
                 " {} {}",
                 quote_identifier(&col.column_name),
                 col.column_type
-            )
-            .ok()?;
+            )?;
 
             write!(
                 sql,
                 " OPTIONS (column_name {})",
                 escape_literal(&col.column_name)
-            )
-            .ok()?;
+            )?;
 
             if col.has_collation() {
                 write!(
@@ -160,16 +208,7 @@ impl<'a> ForeignTableBuilder<'a> {
                     " COLLATE {}.{}",
                     quote_identifier(&col.collation_schema),
                     quote_identifier(&col.collation_name)
-                )
-                .ok()?;
-            }
-
-            if !col.column_default.is_empty() {
-                if col.is_generated() {
-                    write!(sql, " GENERATED ALWAYS AS ({}) STORED", col.column_default).ok()?;
-                } else {
-                    write!(sql, " DEFAULT {}", col.column_default).ok()?;
-                }
+                )?;
             }
 
             if col.is_not_null {
@@ -180,46 +219,121 @@ impl<'a> ForeignTableBuilder<'a> {
         sql.push('\n');
         sql.push(')');
 
-        // Add PARTITION BY clause if table is sharded
-        if let Some(sharded) = self.find_sharded_config() {
-            let strategy = PartitionStrategy::from_sharded_table(sharded);
-            write!(
-                sql,
-                " PARTITION BY {} ({})",
-                strategy.as_sql(),
-                quote_identifier(&sharded.column)
-            )
-            .ok()?;
-        }
-
-        // Add SERVER and OPTIONS
+        let shard = rand::rng().random_range(0..self.sharding_schema.shards.max(1));
         write!(
             sql,
-            "\nSERVER {}\nOPTIONS (schema_name {}, table_name {})",
-            quote_identifier(self.server_name),
+            "\nSERVER shard_{}\nOPTIONS (schema_name {}, table_name {})",
+            shard,
             escape_literal(schema_name),
             escape_literal(table_name)
-        )
-        .ok()?;
+        )?;
 
-        Some(sql)
+        Ok(vec![sql])
+    }
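For a hypothetical unsharded public.users(id bigint NOT NULL, email text) on a single-shard cluster, build_foreign_table above emits DDL of roughly this shape (whitespace approximate; the server number is the randomly selected shard):

    const EXAMPLE_DDL: &str = r#"CREATE FOREIGN TABLE "public"."users" (
     "id" bigint OPTIONS (column_name 'id') NOT NULL,
     "email" text OPTIONS (column_name 'email')
    )
    SERVER shard_0
    OPTIONS (schema_name 'public', table_name 'users')"#;
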
+
+    /// Build a sharded table: parent table + foreign table partitions.
+    fn build_sharded(
+        &self,
+        table_name: &str,
+        schema_name: &str,
+        sharded: &ShardedTable,
+    ) -> Result<Vec<String>, Error> {
+        let strategy = PartitionStrategy::from_sharded_table(sharded);
+        let mut statements = Vec::new();
+
+        // Create parent table with PARTITION BY
+        let mut parent = String::new();
+        let qualified_name = qualified_table(schema_name, table_name);
+        writeln!(parent, "CREATE TABLE {} (", qualified_name)?;
+        parent.push_str(&self.build_columns()?);
+        parent.push('\n');
+        write!(
+            parent,
+            ") PARTITION BY {} ({})",
+            strategy.as_sql(),
+            quote_identifier(&sharded.column)
+        )?;
+        statements.push(parent);
+
+        // Create foreign table partitions for each shard
+        for shard in 0..self.sharding_schema.shards {
+            let mut partition = String::new();
+            let partition_table_name = format!("{}_shard_{}", table_name, shard);
+            let qualified_partition = qualified_table(schema_name, &partition_table_name);
+            let server_name = format!("shard_{}", shard);
+
+            write!(
+                partition,
+                "CREATE FOREIGN TABLE {} PARTITION OF {} ",
+                qualified_partition, qualified_name
+            )?;
+
+            // Partition bounds
+            match &sharded.mapping {
+                None => {
+                    // Hash partitioning
+                    write!(
+                        partition,
+                        "FOR VALUES WITH (MODULUS {}, REMAINDER {})",
+                        self.sharding_schema.shards, shard
+                    )?;
+                }
+                Some(Mapping::List(list_shards)) => {
+                    let values = list_shards.values_for_shard(shard);
+                    if values.is_empty() {
+                        write!(partition, "DEFAULT")?;
+                    } else {
+                        let values_sql: Vec<_> =
+                            values.iter().map(|v| flexible_type_to_sql(v)).collect();
+                        write!(partition, "FOR VALUES IN ({})", values_sql.join(", "))?;
+                    }
+                }
+                Some(Mapping::Range(ranges)) => {
+                    if let Some(range) = ranges.iter().find(|r| r.shard == shard) {
+                        let start = range
+                            .start
+                            .as_ref()
+                            .map(flexible_type_to_sql)
+                            .unwrap_or_else(|| "MINVALUE".to_string());
+                        let end = range
+                            .end
+                            .as_ref()
+                            .map(flexible_type_to_sql)
+                            .unwrap_or_else(|| "MAXVALUE".to_string());
+                        write!(partition, "FOR VALUES FROM ({}) TO ({})", start, end)?;
+                    } else {
+                        write!(partition, "DEFAULT")?;
+                    }
+                }
+            }
+
+            write!(
+                partition,
+                "\nSERVER {}\nOPTIONS (schema_name {}, table_name {})",
+                quote_identifier(&server_name),
+                escape_literal(schema_name),
+                escape_literal(table_name)
+            )?;
+
+            statements.push(partition);
+        }
+
+        Ok(statements)
+    }
 }
 
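build_sharded above emits one parent table plus one partition per shard; the partition bound clause depends on the configured mapping. The three shapes, with illustrative values for a two-shard table:

    // Hash (no mapping configured):
    const HASH_BOUND: &str = "FOR VALUES WITH (MODULUS 2, REMAINDER 0)";
    // List; a shard with no configured values becomes the DEFAULT partition:
    const LIST_BOUND: &str = "FOR VALUES IN ('us', 'ca')";
    // Range; unbounded ends fall back to MINVALUE/MAXVALUE:
    const RANGE_BOUND: &str = "FOR VALUES FROM (MINVALUE) TO (100)";
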
-/// Generate a CREATE FOREIGN TABLE statement from column definitions.
+/// Generate CREATE FOREIGN TABLE statements from column definitions.
 ///
-/// All columns must belong to the same table. The server_name is the foreign server
-/// to reference in the statement. If the table is found in sharded_tables configuration,
-/// adds the appropriate PARTITION BY clause (HASH, LIST, or RANGE).
+/// All columns must belong to the same table. If the table is found in sharded_tables
+/// configuration, creates a partitioned parent table with foreign table partitions
+/// for each shard. Server names are generated as `shard_{n}`.
 ///
-/// TODO: handle partitioned tables by creating the partitions
-/// and sharding the child tables using our partition algorithm.
+/// Returns a list of SQL statements to execute in order.
 pub fn create_foreign_table(
     columns: &[ForeignTableColumn],
-    server_name: &str,
-    sharded_tables: &ShardedTables,
-) -> Option<String> {
-    ForeignTableBuilder::new(columns, server_name, sharded_tables).build()
+    sharding_schema: &ShardingSchema,
+) -> Result<Vec<String>, Error> {
+    ForeignTableBuilder::new(columns, sharding_schema).build()
 }
 
 #[cfg(test)]
@@ -227,6 +341,7 @@ mod test {
     use std::collections::HashSet;
 
     use super::*;
+    use crate::backend::replication::ShardedTables;
     use crate::config::{DataType, FlexibleType, ShardedMapping, ShardedMappingKind};
 
     fn test_column(name: &str, col_type: &str) -> ForeignTableColumn {
@@ -270,6 +385,16 @@ mod test {
         }
     }
 
+    fn sharding_schema_with_tables(tables: ShardedTables, shards: usize) -> ShardingSchema {
+        ShardingSchema {
+            shards,
+            tables,
+            schemas: Default::default(),
+            rewrite: Default::default(),
+            query_parser_engine: Default::default(),
+        }
+    }
+
     fn list_mapping() -> Mapping {
         let mapping = ShardedMapping {
             database: "test".into(),
@@ -313,18 +438,18 @@ mod test {
         },
     ];
 
-        let sharded_tables = ShardedTables::default();
-        let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap();
+        let schema = sharding_schema_with_tables(ShardedTables::default(), 1);
+        let statements = create_foreign_table(&columns, &schema).unwrap();
 
-        assert!(sql.contains("CREATE FOREIGN TABLE"));
-        assert!(sql.contains("test_table"));
+        assert_eq!(statements.len(), 1);
+        let sql = &statements[0];
+        assert!(sql.contains("CREATE FOREIGN TABLE public.test_table"));
         assert!(sql.contains("bigint"));
         assert!(sql.contains("NOT NULL"));
         assert!(sql.contains("OPTIONS (column_name 'id')"));
         assert!(sql.contains("character varying(100)"));
-        assert!(sql.contains("DEFAULT 'unknown'::character varying"));
-        assert!(sql.contains("SERVER"));
-        assert!(sql.contains("remote_server"));
+        assert!(!sql.contains("DEFAULT")); // Defaults handled by remote table
+        assert!(sql.contains("SERVER shard_"));
         assert!(sql.contains("schema_name 'public'"));
         assert!(!sql.contains("PARTITION BY"));
     }
 
@@ -339,76 +464,94 @@ mod test {
         test_column("name", "text"),
     ];
 
-        let sharded_tables: ShardedTables =
-            [test_sharded_table("test_table", "id")].as_slice().into();
-
-        let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap();
-
-        assert!(sql.contains("CREATE FOREIGN TABLE"));
-        assert!(sql.contains("PARTITION BY HASH (id)"));
-        assert!(sql.contains("SERVER remote_server"));
+        let tables: ShardedTables = [test_sharded_table("test_table", "id")].as_slice().into();
+        let schema = sharding_schema_with_tables(tables, 2);
+
+        let statements = create_foreign_table(&columns, &schema).unwrap();
+
+        assert_eq!(statements.len(), 3); // parent + 2 partitions
+        assert!(statements[0].contains("CREATE TABLE public.test_table"));
+        assert!(statements[0].contains("PARTITION BY HASH (id)"));
+        assert!(statements[1].contains(
+            "CREATE FOREIGN TABLE public.test_table_shard_0 PARTITION OF public.test_table"
+        ));
+        assert!(statements[1].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 0)"));
+        assert!(statements[1].contains("SERVER shard_0"));
+        assert!(statements[2].contains(
+            "CREATE FOREIGN TABLE public.test_table_shard_1 PARTITION OF public.test_table"
+        ));
+        assert!(statements[2].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 1)"));
+        assert!(statements[2].contains("SERVER shard_1"));
     }
 
     #[test]
     fn test_create_foreign_table_with_list_sharding() {
         let columns = vec![test_column("id", "bigint"), test_column("region", "text")];
 
-        let sharded_tables: ShardedTables = 
[test_sharded_table_with_mapping( + let tables: ShardedTables = [test_sharded_table_with_mapping( "test_table", "region", list_mapping(), )] .as_slice() .into(); + let schema = sharding_schema_with_tables(tables, 1); - let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap(); + let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(sql.contains("PARTITION BY LIST (region)")); + assert!(statements[0].contains("CREATE TABLE public.test_table")); + assert!(statements[0].contains("PARTITION BY LIST (region)")); } #[test] fn test_create_foreign_table_with_range_sharding() { let columns = vec![test_column("id", "bigint"), test_column("name", "text")]; - let sharded_tables: ShardedTables = [test_sharded_table_with_mapping( + let tables: ShardedTables = [test_sharded_table_with_mapping( "test_table", "id", range_mapping(), )] .as_slice() .into(); + let schema = sharding_schema_with_tables(tables, 1); - let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap(); + let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(sql.contains("PARTITION BY RANGE (id)")); + assert!(statements[0].contains("CREATE TABLE public.test_table")); + assert!(statements[0].contains("PARTITION BY RANGE (id)")); } #[test] fn test_create_foreign_table_no_shard_match() { let columns = vec![test_column("id", "bigint"), test_column("name", "text")]; - // Sharded table config for different table - let sharded_tables: ShardedTables = [test_sharded_table("other_table", "user_id")] + let tables: ShardedTables = [test_sharded_table("other_table", "user_id")] .as_slice() .into(); + let schema = sharding_schema_with_tables(tables, 2); - let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap(); + let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(!sql.contains("PARTITION BY")); + assert_eq!(statements.len(), 1); + assert!(statements[0].contains("CREATE FOREIGN TABLE public.test_table")); + assert!(!statements[0].contains("PARTITION BY")); } #[test] fn test_create_foreign_table_column_mismatch() { let columns = vec![test_column("id", "bigint"), test_column("name", "text")]; - // Sharded table matches by name but column doesn't exist - let sharded_tables: ShardedTables = [test_sharded_table("test_table", "user_id")] + let tables: ShardedTables = [test_sharded_table("test_table", "user_id")] .as_slice() .into(); + let schema = sharding_schema_with_tables(tables, 2); - let sql = create_foreign_table(&columns, "remote_server", &sharded_tables).unwrap(); + let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(!sql.contains("PARTITION BY")); + assert_eq!(statements.len(), 1); + assert!(statements[0].contains("CREATE FOREIGN TABLE public.test_table")); + assert!(!statements[0].contains("PARTITION BY")); } #[test] @@ -419,11 +562,13 @@ mod test { ..test_column("total", "numeric") }]; - let sharded_tables = ShardedTables::default(); - let sql = create_foreign_table(&columns, "srv", &sharded_tables).unwrap(); + let schema = sharding_schema_with_tables(ShardedTables::default(), 1); + let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(sql.contains("GENERATED ALWAYS AS ((price * quantity)) STORED")); - assert!(!sql.contains("DEFAULT")); + assert!(statements[0].contains("CREATE FOREIGN TABLE public.test_table")); + // Defaults and generated columns handled by remote table + assert!(!statements[0].contains("GENERATED")); + 
assert!(!statements[0].contains("DEFAULT")); } #[test] @@ -434,17 +579,18 @@ mod test { ..test_column("title", "text") }]; - let sharded_tables = ShardedTables::default(); - let sql = create_foreign_table(&columns, "srv", &sharded_tables).unwrap(); + let schema = sharding_schema_with_tables(ShardedTables::default(), 1); + let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(sql.contains("COLLATE pg_catalog.\"en_US\"")); + assert!(statements[0].contains("CREATE FOREIGN TABLE public.test_table")); + assert!(statements[0].contains("COLLATE pg_catalog.\"en_US\"")); } #[test] fn test_create_foreign_table_empty_columns() { - let sharded_tables = ShardedTables::default(); - let result = create_foreign_table(&[], "srv", &sharded_tables); - assert!(result.is_none()); + let schema = sharding_schema_with_tables(ShardedTables::default(), 1); + let result = create_foreign_table(&[], &schema); + assert!(result.is_err()); } #[test] From e34589492be3e350e33e05568c6e1a9d6fb80382 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Sun, 1 Feb 2026 23:31:06 -0800 Subject: [PATCH 06/29] save --- Dockerfile | 5 +- pgdog/src/backend/fdw/launcher.rs | 62 ++++++++ pgdog/src/backend/fdw/mod.rs | 3 + pgdog/src/backend/fdw/postgres.rs | 136 ++++++++++++++---- pgdog/src/backend/fdw/postgres_config.rs | 2 +- .../src/backend/schema/postgres_fdw/schema.rs | 4 +- .../backend/schema/postgres_fdw/statement.rs | 39 ++--- 7 files changed, 200 insertions(+), 51 deletions(-) create mode 100644 pgdog/src/backend/fdw/launcher.rs diff --git a/Dockerfile b/Dockerfile index b4ff5a87..eaa89257 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ WORKDIR /build RUN rm /bin/sh && ln -s /bin/bash /bin/sh RUN source ~/.cargo/env && \ if [ "$(uname -m)" = "aarch64" ] || [ "$(uname -m)" = "arm64" ]; then \ - export RUSTFLAGS="-Ctarget-feature=+lse"; \ + export RUSTFLAGS="-Ctarget-feature=+lse"; \ fi && \ cd pgdog && \ cargo build --release @@ -31,7 +31,8 @@ RUN install -d /usr/share/postgresql-common/pgdg && \ . 
/etc/os-release && \ sh -c "echo 'deb [signed-by=/usr/share/postgresql-common/pgdg/apt.postgresql.org.asc] https://apt.postgresql.org/pub/repos/apt $VERSION_CODENAME-pgdg main' > /etc/apt/sources.list.d/pgdg.list" -RUN apt update && apt install -y postgresql-client-${PSQL_VERSION} +RUN apt update && apt install -y postgresql-${PSQL_VERSION} && \ + systemctl disable postgresql COPY --from=builder /build/target/release/pgdog /usr/local/bin/pgdog diff --git a/pgdog/src/backend/fdw/launcher.rs b/pgdog/src/backend/fdw/launcher.rs new file mode 100644 index 00000000..6694c14f --- /dev/null +++ b/pgdog/src/backend/fdw/launcher.rs @@ -0,0 +1,62 @@ +use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, +}; + +use super::{Error, PostgresProcess}; +use tokio::{ + select, spawn, + sync::Notify, + time::{sleep, Duration}, +}; +use tracing::error; + +#[derive(Debug, Clone)] +pub(crate) struct PostgresLauncher { + shutdown: Arc, + online: Arc, +} + +impl PostgresLauncher { + pub(crate) async fn spawn(&self) { + let launcher = self.clone(); + + spawn(async move { + loop { + if let Err(err) = launcher.run().await { + error!("[fdw] launcher exited with error: {}", err); + } + + let online = launcher.online.load(Ordering::Relaxed); + if !online { + break; + } else { + sleep(Duration::from_millis(1000)).await; + } + } + }); + } + + pub(crate) fn shutdown(&self) { + self.shutdown.notify_waiters(); + } + + async fn run(&self) -> Result<(), Error> { + let mut process = PostgresProcess::new(None, 45000)?; + let waiter = process.notify(); + + process.launch().await?; + + select! { + _ = self.shutdown.notified() => { + process.stop().await; + } + + _ = waiter.notified() => { + // Unexpected exit. + } + } + + Ok(()) + } +} diff --git a/pgdog/src/backend/fdw/mod.rs b/pgdog/src/backend/fdw/mod.rs index a4f7e082..e9ffbbfc 100644 --- a/pgdog/src/backend/fdw/mod.rs +++ b/pgdog/src/backend/fdw/mod.rs @@ -1,6 +1,9 @@ pub mod error; +pub mod launcher; pub mod postgres; pub mod postgres_config; pub use error::Error; +pub(crate) use launcher::PostgresLauncher; +pub(crate) use postgres::PostgresProcess; pub(crate) use postgres_config::PostgresConfig; diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs index 1d70a317..6e25319c 100644 --- a/pgdog/src/backend/fdw/postgres.rs +++ b/pgdog/src/backend/fdw/postgres.rs @@ -28,9 +28,9 @@ use tokio::{ use tracing::{error, info, warn}; use crate::backend::{ - pool::{Address, Request}, + pool::{Address, Config, PoolConfig, Request}, schema::postgres_fdw::{quote_identifier, ForeignTableSchema}, - Cluster, ConnectReason, Server, ServerOptions, + Cluster, ConnectReason, Pool, Server, ServerOptions, }; use super::{Error, PostgresConfig}; @@ -191,12 +191,43 @@ impl PostgresProcess { } } - process.notify.notify_one(); + process.notify.notify_waiters(); }); Ok(()) } + /// Access the notify channel. + pub(super) fn notify(&self) -> Arc { + self.notify.clone() + } + + pub(super) fn connection_pools(&self, cluster: &Cluster) -> Result, Error> { + Ok(Self::pools_to_databases(cluster, 0)? 
+ .into_iter() + .map(|(database, _)| { + let address = Address { + host: "127.0.0.1".into(), + port: self.port, + database_name: database.clone(), + password: "".into(), // We use trust + user: cluster.identifier().user.clone(), + database_number: 0, + }; + + Pool::new(&PoolConfig { + address, + config: Config { + inner: pgdog_stats::Config { + max: 10, + ..Default::default() + }, + }, + }) + }) + .collect()) + } + fn pools_to_databases( cluster: &Cluster, shard: usize, @@ -259,15 +290,31 @@ impl PostgresProcess { } } + let user = cluster.identifier().user.clone(); + + if !self.users.contains(&user) { + admin_connection + .execute(format!( + "CREATE USER {} SUPERUSER LOGIN", + quote_identifier(&cluster.identifier().user) + )) + .await?; + self.users.insert(user); + } + Ok(created) } /// Create the same load-balancing and sharding setup we have in pgdog.toml /// for this cluster. pub(crate) async fn configure(&mut self, cluster: &Cluster) -> Result<(), Error> { - if !self.setup_databases(cluster).await? { - return Ok(()); - } + let new_database = self.setup_databases(cluster).await?; + + info!( + "[fdw] setting up {} (new={})", + cluster.identifier().database, + new_database + ); let sharding_schema = cluster.sharding_schema(); @@ -290,44 +337,60 @@ impl PostgresProcess { .collect(); for database in &databases { - let mut connection = self.connection("postgres", database).await?; + let identifier = (cluster.identifier().user.clone(), database.clone()); + let mut connection = self.connection(&identifier.0, database).await?; - connection - .execute("CREATE EXTENSION IF NOT EXISTS postgres_fdw") - .await?; + if new_database { + connection + .execute("CREATE EXTENSION IF NOT EXISTS postgres_fdw") + .await?; + } - connections.insert(database.clone(), connection); + connections.insert(identifier, connection); } for (number, _) in cluster.shards().iter().enumerate() { for (database, address) in Self::pools_to_databases(cluster, number)? 
{ - let connection = connections.get_mut(&database).expect("connection is gone"); - - connection - .execute(format!( - r#"CREATE SERVER IF NOT EXISTS "shard_{}" + let identifier = (cluster.identifier().user.clone(), database.clone()); + let connection = connections + .get_mut(&identifier) + .expect("connection is gone"); + + if new_database { + connection + .execute(format!( + r#"CREATE SERVER IF NOT EXISTS "shard_{}" FOREIGN DATA WRAPPER postgres_fdw OPTIONS (host '{}', port '{}', dbname '{}')"#, - number, address.host, address.port, address.database_name, - )) - .await?; + number, address.host, address.port, address.database_name, + )) + .await?; + } connection .execute(format!( r#" CREATE USER MAPPING IF NOT EXISTS - FOR postgres + FOR {} SERVER "shard_{}" OPTIONS (user '{}', password '{}')"#, - number, address.user, address.password + quote_identifier(&identifier.0), + number, + address.user, + address.password )) .await?; } } - for database in &databases { - let mut connection = connections.get_mut(database).expect("connection is gone"); - schema.setup(&mut connection, &sharding_schema).await?; + if new_database { + for database in &databases { + let identifier = (cluster.identifier().user.clone(), database.clone()); + let mut connection = connections + .get_mut(&identifier) + .expect("connection is gone"); + schema.setup(&mut connection, &sharding_schema).await?; + } } Ok(()) @@ -411,6 +474,14 @@ mod test { let cluster = Cluster::new_test(&config()); cluster.launch(); + { + let mut primary = cluster.primary(0, &Request::default()).await.unwrap(); + primary + .execute("CREATE TABLE IF NOT EXISTS test_postgres_process (customer_id BIGINT)") + .await + .unwrap(); + } + let mut process = PostgresProcess::new(None, 45012).unwrap(); process.launch().await.unwrap(); @@ -421,6 +492,7 @@ mod test { .fetch_all::("SELECT backend_type::text FROM pg_stat_activity ORDER BY 1") .await .unwrap(); + assert_eq!( backends, [ @@ -431,12 +503,22 @@ mod test { ] ); + let mut server = process.connection("pgdog", "pgdog_p").await.unwrap(); server - .execute("CREATE TABLE test (id BIGINT)") + .execute("SELECT * FROM pgdog.test_postgres_process") .await .unwrap(); - server.execute("INSERT INTO test VALUES (1)").await.unwrap(); - server.execute("CHECKPOINT").await.unwrap(); + process.stop().await; + + { + let mut primary = cluster.primary(0, &Request::default()).await.unwrap(); + primary + .execute("DROP TABLE test_postgres_process") + .await + .unwrap(); + } + + cluster.shutdown(); } } diff --git a/pgdog/src/backend/fdw/postgres_config.rs b/pgdog/src/backend/fdw/postgres_config.rs index 1cc309be..456112ca 100644 --- a/pgdog/src/backend/fdw/postgres_config.rs +++ b/pgdog/src/backend/fdw/postgres_config.rs @@ -46,7 +46,7 @@ impl PostgresConfig { self.set("log_line_prefix", "''"); self.set("log_connections", "on"); self.set("log_disconnections", "on"); - self.set("log_statement", "ddl"); + // self.set("log_statement", "off"); // Disable autovacuum. This is safe, this database doesn't write anything locally. self.set("autovacuum", "off"); // Make the background writer do nothing. diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs index 761f2648..170751eb 100644 --- a/pgdog/src/backend/schema/postgres_fdw/schema.rs +++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs @@ -1,7 +1,7 @@ //! Foreign table schema query and data structures. 
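Per shard, configure() above issues a foreign server plus a user mapping before any tables are created. The DDL pair it formats, extracted into a standalone helper (hypothetical name, illustrative arguments):

    fn bootstrap_sql(
        shard: usize,
        host: &str,
        port: u16,
        dbname: &str,
        user: &str,
        password: &str,
    ) -> [String; 2] {
        [
            format!(
                r#"CREATE SERVER IF NOT EXISTS "shard_{shard}"
    FOREIGN DATA WRAPPER postgres_fdw
    OPTIONS (host '{host}', port '{port}', dbname '{dbname}')"#
            ),
            format!(
                r#"CREATE USER MAPPING IF NOT EXISTS
    FOR "{user}" SERVER "shard_{shard}"
    OPTIONS (user '{user}', password '{password}')"#
            ),
        ]
    }
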
use std::collections::{HashMap, HashSet}; -use tracing::info; +use tracing::debug; use crate::{ backend::{schema::postgres_fdw::create_foreign_table, Server, ShardingSchema}, @@ -81,7 +81,7 @@ impl ForeignTableSchema { if !tables.contains(&dedup) { let statements = create_foreign_table(columns, sharding_schema)?; for sql in statements { - info!("[fdw::setup] {} [{}]", sql, server.addr()); + debug!("[fdw::setup] {} [{}]", sql, server.addr()); server.execute(&sql).await?; } tables.insert(dedup); diff --git a/pgdog/src/backend/schema/postgres_fdw/statement.rs b/pgdog/src/backend/schema/postgres_fdw/statement.rs index 8d1b34aa..77117c6f 100644 --- a/pgdog/src/backend/schema/postgres_fdw/statement.rs +++ b/pgdog/src/backend/schema/postgres_fdw/statement.rs @@ -443,13 +443,13 @@ mod test { assert_eq!(statements.len(), 1); let sql = &statements[0]; - assert!(sql.contains("CREATE FOREIGN TABLE public.test_table")); + assert!(sql.contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); assert!(sql.contains("bigint")); assert!(sql.contains("NOT NULL")); assert!(sql.contains("OPTIONS (column_name 'id')")); assert!(sql.contains("character varying(100)")); assert!(!sql.contains("DEFAULT")); // Defaults handled by remote table - assert!(sql.contains("SERVER shard_")); + assert!(sql.contains("SERVER")); assert!(sql.contains("schema_name 'public'")); assert!(!sql.contains("PARTITION BY")); } @@ -470,18 +470,18 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); assert_eq!(statements.len(), 3); // parent + 2 partitions - assert!(statements[0].contains("CREATE TABLE public.test_table")); - assert!(statements[0].contains("PARTITION BY HASH (id)")); + assert!(statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); + assert!(statements[0].contains(r#"PARTITION BY HASH ("id")"#)); assert!(statements[1].contains( - "CREATE FOREIGN TABLE public.test_table_shard_0 PARTITION OF public.test_table" + r#"CREATE FOREIGN TABLE "public"."test_table_shard_0" PARTITION OF "public"."test_table""# )); assert!(statements[1].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 0)")); - assert!(statements[1].contains("SERVER shard_0")); + assert!(statements[1].contains(r#"SERVER "shard_0""#)); assert!(statements[2].contains( - "CREATE FOREIGN TABLE public.test_table_shard_1 PARTITION OF public.test_table" + r#"CREATE FOREIGN TABLE "public"."test_table_shard_1" PARTITION OF "public"."test_table""# )); assert!(statements[2].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 1)")); - assert!(statements[2].contains("SERVER shard_1")); + assert!(statements[2].contains(r#"SERVER "shard_1""#)); } #[test] @@ -499,8 +499,8 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(statements[0].contains("CREATE TABLE public.test_table")); - assert!(statements[0].contains("PARTITION BY LIST (region)")); + assert!(statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); + assert!(statements[0].contains(r#"PARTITION BY LIST ("region")"#)); } #[test] @@ -518,8 +518,8 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(statements[0].contains("CREATE TABLE public.test_table")); - assert!(statements[0].contains("PARTITION BY RANGE (id)")); + assert!(statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); + assert!(statements[0].contains(r#"PARTITION BY RANGE ("id")"#)); } #[test] @@ -534,7 +534,7 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); assert_eq!(statements.len(), 1); - 
assert!(statements[0].contains("CREATE FOREIGN TABLE public.test_table")); + assert!(statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); assert!(!statements[0].contains("PARTITION BY")); } @@ -550,7 +550,7 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); assert_eq!(statements.len(), 1); - assert!(statements[0].contains("CREATE FOREIGN TABLE public.test_table")); + assert!(statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); assert!(!statements[0].contains("PARTITION BY")); } @@ -565,7 +565,7 @@ mod test { let schema = sharding_schema_with_tables(ShardedTables::default(), 1); let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(statements[0].contains("CREATE FOREIGN TABLE public.test_table")); + assert!(statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); // Defaults and generated columns handled by remote table assert!(!statements[0].contains("GENERATED")); assert!(!statements[0].contains("DEFAULT")); @@ -582,8 +582,8 @@ mod test { let schema = sharding_schema_with_tables(ShardedTables::default(), 1); let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(statements[0].contains("CREATE FOREIGN TABLE public.test_table")); - assert!(statements[0].contains("COLLATE pg_catalog.\"en_US\"")); + assert!(statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); + assert!(statements[0].contains(r#"COLLATE "pg_catalog"."en_US""#)); } #[test] @@ -595,11 +595,12 @@ mod test { #[test] fn test_quote_identifier() { - assert_eq!(quote_identifier("users"), "users"); + // All identifiers are now quoted + assert_eq!(quote_identifier("users"), "\"users\""); assert_eq!(quote_identifier("my table"), "\"my table\""); assert_eq!(quote_identifier("123abc"), "\"123abc\""); assert_eq!(quote_identifier("has\"quote"), "\"has\"\"quote\""); assert_eq!(quote_identifier("CamelCase"), "\"CamelCase\""); - assert_eq!(quote_identifier("_valid"), "_valid"); + assert_eq!(quote_identifier("_valid"), "\"_valid\""); } } From dbf324226d7c2f6f29eb44431f196eff28218d9b Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Mon, 2 Feb 2026 07:45:10 -0800 Subject: [PATCH 07/29] bins --- pgdog/src/backend/fdw/bins.rs | 55 ++++++++++++++++++++++++ pgdog/src/backend/fdw/error.rs | 8 ++++ pgdog/src/backend/fdw/launcher.rs | 2 +- pgdog/src/backend/fdw/mod.rs | 1 + pgdog/src/backend/fdw/postgres.rs | 22 +++++++--- pgdog/src/backend/fdw/postgres_config.rs | 21 ++++++--- 6 files changed, 96 insertions(+), 13 deletions(-) create mode 100644 pgdog/src/backend/fdw/bins.rs diff --git a/pgdog/src/backend/fdw/bins.rs b/pgdog/src/backend/fdw/bins.rs new file mode 100644 index 00000000..6c6183d8 --- /dev/null +++ b/pgdog/src/backend/fdw/bins.rs @@ -0,0 +1,55 @@ +use std::path::PathBuf; + +use tokio::process::Command; +use tracing::error; + +use super::Error; + +pub(super) struct Bins { + pub(super) postgres: PathBuf, + pub(super) initdb: PathBuf, + pub(super) version: f32, +} + +impl Bins { + pub(super) async fn new() -> Result { + let pg_config = Command::new("pg_config").output().await?; + + if !pg_config.status.success() { + error!( + "[fdw] pg_config: {}", + String::from_utf8_lossy(&pg_config.stderr) + ); + return Err(Error::PgConfig); + } + + let pg_config = String::from_utf8_lossy(&pg_config.stdout); + let mut path = PathBuf::new(); + let mut version = 0.0; + + for line in pg_config.lines() { + if line.starts_with("BINDIR") { + let bin_dir = line.split("BINDIR = 
").last().unwrap_or_default().trim(); + path = PathBuf::from(bin_dir); + } + if line.starts_with("VERSION") { + version = line + .split("VERSION = ") + .last() + .unwrap_or_default() + .trim() + .split(" ") + .nth(1) + .unwrap_or_default() + .trim() + .parse()?; + } + } + + Ok(Self { + postgres: path.join("postgres"), + initdb: path.join("initdb"), + version, + }) + } +} diff --git a/pgdog/src/backend/fdw/error.rs b/pgdog/src/backend/fdw/error.rs index b357ace9..0abc5bdc 100644 --- a/pgdog/src/backend/fdw/error.rs +++ b/pgdog/src/backend/fdw/error.rs @@ -1,3 +1,5 @@ +use std::num::ParseFloatError; + use thiserror::Error; #[derive(Debug, Error)] @@ -8,6 +10,9 @@ pub enum Error { #[error("initdb failed")] InitDb, + #[error("pg_config failed")] + PgConfig, + #[error("backend: {0}")] Backend(#[from] crate::backend::Error), @@ -22,4 +27,7 @@ pub enum Error { #[error("shards don't have the same number of replicas/primary")] ShardsHostsMismatch, + + #[error("error parsing postgres version")] + PostgresVersion(#[from] ParseFloatError), } diff --git a/pgdog/src/backend/fdw/launcher.rs b/pgdog/src/backend/fdw/launcher.rs index 6694c14f..4b6dff0f 100644 --- a/pgdog/src/backend/fdw/launcher.rs +++ b/pgdog/src/backend/fdw/launcher.rs @@ -42,7 +42,7 @@ impl PostgresLauncher { } async fn run(&self) -> Result<(), Error> { - let mut process = PostgresProcess::new(None, 45000)?; + let mut process = PostgresProcess::new(None, 45000).await?; let waiter = process.notify(); process.launch().await?; diff --git a/pgdog/src/backend/fdw/mod.rs b/pgdog/src/backend/fdw/mod.rs index e9ffbbfc..1709aafb 100644 --- a/pgdog/src/backend/fdw/mod.rs +++ b/pgdog/src/backend/fdw/mod.rs @@ -1,3 +1,4 @@ +pub mod bins; pub mod error; pub mod launcher; pub mod postgres; diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs index 6e25319c..5f0c0a70 100644 --- a/pgdog/src/backend/fdw/postgres.rs +++ b/pgdog/src/backend/fdw/postgres.rs @@ -33,7 +33,7 @@ use crate::backend::{ Cluster, ConnectReason, Pool, Server, ServerOptions, }; -use super::{Error, PostgresConfig}; +use super::{bins::Bins, Error, PostgresConfig}; static LOG_PREFIX: Lazy = Lazy::new(|| Regex::new(r"^(LOG|WARNING|ERROR|FATAL|PANIC|DEBUG\d?|INFO|NOTICE):\s+").unwrap()); @@ -76,10 +76,11 @@ pub(crate) struct PostgresProcess { databases: HashSet, users: HashSet, pid: Option, + version: f32, } impl PostgresProcess { - pub(crate) fn new(initdb_path: Option<&Path>, port: u16) -> Result { + pub(crate) async fn new(initdb_path: Option<&Path>, port: u16) -> Result { let notify = Arc::new(Notify::new()); let initdb_path = if let Some(path) = initdb_path { @@ -88,21 +89,28 @@ impl PostgresProcess { TempDir::new()?.keep() }; + let bins = Bins::new().await?; + Ok(Self { - postres: PathBuf::from("postgres"), - initdb: PathBuf::from("initdb"), + postres: bins.postgres, + initdb: bins.initdb, initdb_dir: initdb_path, notify, port, databases: HashSet::new(), users: HashSet::new(), pid: None, + version: bins.version, }) } /// Setup and launch Postgres process. pub(crate) async fn launch(&mut self) -> Result<(), Error> { - info!("[fdw] initializing \"{}\"", self.initdb_dir.display()); + info!( + "[fdw] initializing \"{}\" (PostgreSQL {})", + self.initdb_dir.display(), + self.version + ); let process = Command::new(&self.initdb) .arg("-D") @@ -123,7 +131,7 @@ impl PostgresProcess { // Configure Postgres. PostgresConfig::new(&self.initdb_dir.join("postgresql.conf")) .await? 
- .configure_and_save(self.port) + .configure_and_save(self.port, self.version) .await?; let child = Command::new(&self.postres) @@ -482,7 +490,7 @@ mod test { .unwrap(); } - let mut process = PostgresProcess::new(None, 45012).unwrap(); + let mut process = PostgresProcess::new(None, 45012).await.unwrap(); process.launch().await.unwrap(); process.wait_ready().await.unwrap(); diff --git a/pgdog/src/backend/fdw/postgres_config.rs b/pgdog/src/backend/fdw/postgres_config.rs index 456112ca..ac1ee1e5 100644 --- a/pgdog/src/backend/fdw/postgres_config.rs +++ b/pgdog/src/backend/fdw/postgres_config.rs @@ -36,13 +36,21 @@ impl PostgresConfig { } /// Configure default settings we need off/on. - pub(crate) async fn configure_and_save(&mut self, port: u16) -> Result<(), Error> { + pub(crate) async fn configure_and_save( + &mut self, + port: u16, + version: f32, + ) -> Result<(), Error> { + // Make it accessible via psql for debugging. + self.set("listen_addresses", "'0.0.0.0'"); + self.set("port", &port.to_string()); + // Disable logical replication workers. self.set("max_logical_replication_workers", "0"); self.set("max_sync_workers_per_subscription", "0"); self.set("max_parallel_apply_workers_per_subscription", "0"); - self.set("port", &port.to_string()); - self.set("max_connections", "100"); + + self.set("max_connections", "1000"); self.set("log_line_prefix", "''"); self.set("log_connections", "on"); self.set("log_disconnections", "on"); @@ -52,8 +60,11 @@ impl PostgresConfig { // Make the background writer do nothing. self.set("bgwriter_lru_maxpages", "0"); self.set("bgwriter_delay", "10s"); - // Disable async io workers. - self.set("io_method", "sync"); + + if version >= 18.0 { + // Disable async io workers. + self.set("io_method", "sync"); + } self.save().await?; From d8cef39e1c9b3d051cf7bd7ff4ae8d830d461fcf Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Mon, 2 Feb 2026 09:42:45 -0800 Subject: [PATCH 08/29] failing test --- pgdog-config/src/core.rs | 5 +- pgdog-config/src/fdw.rs | 40 ++++++ pgdog-config/src/lib.rs | 2 + pgdog/src/backend/fdw/launcher.rs | 198 ++++++++++++++++++++++++++++-- pgdog/src/backend/fdw/postgres.rs | 33 +++-- 5 files changed, 257 insertions(+), 21 deletions(-) create mode 100644 pgdog-config/src/fdw.rs diff --git a/pgdog-config/src/core.rs b/pgdog-config/src/core.rs index cb7fd7b0..85a5bba0 100644 --- a/pgdog-config/src/core.rs +++ b/pgdog-config/src/core.rs @@ -6,7 +6,7 @@ use tracing::{info, warn}; use crate::sharding::ShardedSchema; use crate::{ - system_catalogs, EnumeratedDatabase, Memory, OmnishardedTable, PassthoughAuth, + system_catalogs, EnumeratedDatabase, Fdw, Memory, OmnishardedTable, PassthoughAuth, PreparedStatements, QueryParserEngine, QueryParserLevel, ReadWriteSplit, RewriteMode, Role, SystemCatalogsBehavior, }; @@ -187,6 +187,9 @@ pub struct Config { /// Memory tweaks #[serde(default)] pub memory: Memory, + + #[serde(default)] + pub fdw: Fdw, } impl Config { diff --git a/pgdog-config/src/fdw.rs b/pgdog-config/src/fdw.rs new file mode 100644 index 00000000..2f94c8b7 --- /dev/null +++ b/pgdog-config/src/fdw.rs @@ -0,0 +1,40 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Copy)] +#[serde(deny_unknown_fields)] +pub struct Fdw { + #[serde(default)] + pub enabled: bool, + + #[serde(default = "default_green_port")] + pub green_port: u16, + + #[serde(default = "default_blue_port")] + pub blue_port: u16, + + #[serde(default = "default_launch_timeout")] + pub launch_timeout: u64, +} + +impl Default for Fdw { 
+    fn default() -> Self {
+        Self {
+            enabled: bool::default(),
+            green_port: default_green_port(),
+            blue_port: default_blue_port(),
+            launch_timeout: default_launch_timeout(),
+        }
+    }
+}
+
+fn default_green_port() -> u16 {
+    6433
+}
+
+fn default_blue_port() -> u16 {
+    6434
+}
+
+fn default_launch_timeout() -> u64 {
+    5_000
+}
diff --git a/pgdog-config/src/lib.rs b/pgdog-config/src/lib.rs
index a08b302c..be02d553 100644
--- a/pgdog-config/src/lib.rs
+++ b/pgdog-config/src/lib.rs
@@ -4,6 +4,7 @@ pub mod core;
 pub mod data_types;
 pub mod database;
 pub mod error;
+pub mod fdw;
 pub mod general;
 pub mod memory;
 pub mod networking;
@@ -24,6 +25,7 @@ pub use database::{
     Database, EnumeratedDatabase, LoadBalancingStrategy, ReadWriteSplit, ReadWriteStrategy, Role,
 };
 pub use error::Error;
+pub use fdw::Fdw;
 pub use general::General;
 pub use memory::*;
 pub use networking::{MultiTenant, Tcp, TlsVerifyMode};
diff --git a/pgdog/src/backend/fdw/launcher.rs b/pgdog/src/backend/fdw/launcher.rs
index 4b6dff0f..a26e724d 100644
--- a/pgdog/src/backend/fdw/launcher.rs
+++ b/pgdog/src/backend/fdw/launcher.rs
@@ -1,28 +1,86 @@
-use std::sync::{
-    atomic::{AtomicBool, Ordering},
-    Arc,
+use std::{
+    ops::Deref,
+    sync::{
+        atomic::{AtomicBool, AtomicU16, Ordering},
+        Arc,
+    },
 };
 
+use crate::config::config;
+
 use super::{Error, PostgresProcess};
+use once_cell::sync::Lazy;
 use tokio::{
     select, spawn,
     sync::Notify,
-    time::{sleep, Duration},
+    time::{sleep, timeout, Duration},
 };
-use tracing::error;
+use tracing::{error, info};
+
+static LAUNCHER: Lazy<PostgresLauncher> = Lazy::new(PostgresLauncher::new);
 
 #[derive(Debug, Clone)]
-pub(crate) struct PostgresLauncher {
-    shutdown: Arc<Notify>,
-    online: Arc<AtomicBool>,
+pub struct PostgresLauncher {
+    inner: Arc<Inner>,
+}
+
+impl Deref for PostgresLauncher {
+    type Target = Inner;
+
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct Inner {
+    restart: Notify,
+    online: AtomicBool,
+    port: AtomicU16,
+    ready_signal: Notify,
+    ready: AtomicBool,
+}
 
 impl PostgresLauncher {
-    pub(crate) async fn spawn(&self) {
+    fn new() -> Self {
+        let fdw = config().config.fdw;
+        let port = AtomicU16::new(fdw.blue_port);
+
+        let launcher = Self {
+            inner: Arc::new(Inner {
+                port,
+                ..Default::default()
+            }),
+        };
+
+        launcher.spawn();
+
+        launcher
+    }
+
+    /// Get the launcher singleton instance.
+    pub(crate) fn get() -> Self {
+        LAUNCHER.clone()
+    }
+
+    fn spawn(&self) {
         let launcher = self.clone();
 
         spawn(async move {
+            let online = launcher.online.load(Ordering::Relaxed);
+
+            if !online {
+                launcher.restart.notified().await;
+            }
+
+            launcher.online.store(true, Ordering::Relaxed);
+
             loop {
+                info!(
+                    "[fdw] launching fdw backend on 0.0.0.0:{}",
+                    launcher.port.load(Ordering::Relaxed),
+                );
+
                 if let Err(err) = launcher.run().await {
                     error!("[fdw] launcher exited with error: {}", err);
                 }
@@ -38,18 +96,72 @@ impl PostgresLauncher {
     }
 
     pub(crate) fn shutdown(&self) {
-        self.shutdown.notify_waiters();
+        self.online.store(false, Ordering::Relaxed);
+        self.ready.store(false, Ordering::Relaxed);
+        self.restart.notify_waiters();
+    }
+
+    /// Trigger blue/green deployment.
+    pub(crate) fn launch_blue_green(&self) {
+        let fdw = config().config.fdw;
+        let port = self.port.load(Ordering::Relaxed);
+        let port = if port == fdw.blue_port {
+            fdw.green_port
+        } else {
+            fdw.blue_port
+        };
+
+        self.port.store(port, Ordering::Relaxed);
+        self.ready.store(false, Ordering::Relaxed);
+        self.restart.notify_waiters();
+    }
+
+    /// Wait for Postgres to be ready.
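The handle pattern above (an Arc around Inner plus Deref) is what lets every clone of the launcher share one set of atomics and Notify channels while call sites write launcher.port.load(...) directly. Reduced to its essentials:

    use std::ops::Deref;
    use std::sync::{
        atomic::{AtomicU16, Ordering},
        Arc,
    };

    #[derive(Default)]
    struct Inner {
        port: AtomicU16,
    }

    #[derive(Clone, Default)]
    struct Handle {
        inner: Arc<Inner>,
    }

    impl Deref for Handle {
        type Target = Inner;

        fn deref(&self) -> &Self::Target {
            &self.inner
        }
    }

    fn main() {
        let a = Handle::default();
        let b = a.clone();
        a.port.store(6433, Ordering::Relaxed);
        // Both handles observe the same shared state.
        assert_eq!(b.port.load(Ordering::Relaxed), 6433);
    }
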
+ pub(crate) async fn wait_ready(&self, launch_timeout: Duration) -> Result<(), Error> { + let ready = self.ready.load(Ordering::Relaxed); + + if ready { + return Ok(()); + } + + let waiter = self.ready_signal.notified(); + let ready = self.ready.load(Ordering::Relaxed); + + if ready { + return Ok(()); + } + + timeout(launch_timeout, waiter).await?; + + Ok(()) + } + + fn mark_ready(&self) { + self.ready.store(true, Ordering::Relaxed); + self.ready_signal.notify_waiters(); } async fn run(&self) -> Result<(), Error> { - let mut process = PostgresProcess::new(None, 45000).await?; + let port = self.port.load(Ordering::Relaxed); + let mut process = PostgresProcess::new(None, port).await?; let waiter = process.notify(); process.launch().await?; + process.wait_ready(Duration::MAX).await?; + + self.mark_ready(); select! { - _ = self.shutdown.notified() => { - process.stop().await; + _ = self.restart.notified() => { + let online = self.online.load(Ordering::Relaxed); + if online { + println!("requesting stop"); + process.request_stop(); + } else { + println!("waiting for stop"); + process.stop_wait().await; + self.mark_ready(); + } } _ = waiter.notified() => { @@ -60,3 +172,63 @@ impl PostgresLauncher { Ok(()) } } + +#[cfg(test)] +mod test { + use super::*; + use crate::backend::{pool::Address, ConnectReason, Server, ServerOptions}; + + #[tokio::test] + async fn test_postgres_blue_green() { + crate::logger(); + + let launcher = PostgresLauncher::get(); + sleep(Duration::from_millis(10)).await; + launcher.launch_blue_green(); + launcher + .wait_ready(Duration::from_millis(5000)) + .await + .unwrap(); + let conn = Server::connect( + &Address { + host: "127.0.0.1".into(), + port: 6433, + user: "postgres".into(), + database_name: "postgres".into(), + ..Default::default() + }, + ServerOptions::default(), + ConnectReason::default(), + ) + .await + .unwrap(); + drop(conn); + launcher.launch_blue_green(); + launcher + .wait_ready(Duration::from_millis(5000)) + .await + .unwrap(); + // let conn = Server::connect( + // &Address { + // host: "127.0.0.1".into(), + // port: 6434, + // user: "postgres".into(), + // ..Default::default() + // }, + // ServerOptions::default(), + // ConnectReason::default(), + // ) + // .await + // .unwrap(); + launcher.shutdown(); + launcher + .wait_ready(Duration::from_millis(5000)) + .await + .unwrap(); + // launcher.launch_blue_green(); + // launcher + // .wait_ready(Duration::from_millis(5000)) + // .await + // .unwrap(); + } +} diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs index 5f0c0a70..ddd86a6a 100644 --- a/pgdog/src/backend/fdw/postgres.rs +++ b/pgdog/src/backend/fdw/postgres.rs @@ -64,6 +64,15 @@ impl PostgresProcessAsync { Ok(()) } + + /// Force stop immediately. 
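wait_ready above re-checks the ready flag after creating the Notified future to narrow the window for a lost wakeup. With tokio's Notify, a notified() future only becomes eligible for notify_waiters() once it has been polled, so the fully airtight variant documented by tokio pins the future and calls enable() before the second check; a sketch of that version:

    use std::pin::pin;
    use std::sync::atomic::{AtomicBool, Ordering};
    use tokio::sync::Notify;

    async fn wait_ready(ready: &AtomicBool, signal: &Notify) {
        if ready.load(Ordering::Acquire) {
            return;
        }
        let mut waiter = pin!(signal.notified());
        // Register with the Notify so a notify_waiters() can't slip between
        // the flag check below and the await.
        waiter.as_mut().enable();
        if ready.load(Ordering::Acquire) {
            return;
        }
        waiter.await;
    }
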
+ async fn force_stop(&mut self) -> Result<(), Error> { + self.child.kill().await?; + self.child.wait().await?; + remove_dir_all(&self.initdb_dir).await?; + + Ok(()) + } } #[derive(Debug, Clone)] @@ -134,6 +143,11 @@ impl PostgresProcess { .configure_and_save(self.port, self.version) .await?; + info!( + "[fdw] launching PostgreSQL {} on 0.0.0.0:{}", + self.version, self.port + ); + let child = Command::new(&self.postres) .arg("-D") .arg(&self.initdb_dir) @@ -319,9 +333,9 @@ impl PostgresProcess { let new_database = self.setup_databases(cluster).await?; info!( - "[fdw] setting up {} (new={})", + "[fdw] setting up database={} user={}", cluster.identifier().database, - new_database + cluster.identifier().user, ); let sharding_schema = cluster.sharding_schema(); @@ -430,8 +444,8 @@ impl PostgresProcess { } /// Wait until process is ready and accepting connections. - pub(crate) async fn wait_ready(&self) -> Result<(), Error> { - timeout(Duration::from_millis(5000), self.wait_ready_internal()).await?; + pub(crate) async fn wait_ready(&self, launch_timeout: Duration) -> Result<(), Error> { + timeout(launch_timeout, self.wait_ready_internal()).await?; Ok(()) } @@ -443,11 +457,16 @@ impl PostgresProcess { } } - pub(crate) async fn stop(&mut self) { + pub(crate) async fn stop_wait(&mut self) { self.notify.notify_one(); self.notify.notified().await; self.pid.take(); } + + pub(crate) fn request_stop(&mut self) { + self.notify.notify_one(); + self.pid.take(); + } } impl Drop for PostgresProcess { @@ -493,7 +512,7 @@ mod test { let mut process = PostgresProcess::new(None, 45012).await.unwrap(); process.launch().await.unwrap(); - process.wait_ready().await.unwrap(); + process.wait_ready(Duration::from_secs(5)).await.unwrap(); process.configure(&cluster).await.unwrap(); let mut server = process.admin_connection().await.unwrap(); let backends = server @@ -517,7 +536,7 @@ mod test { .await .unwrap(); - process.stop().await; + process.stop_wait().await; { let mut primary = cluster.primary(0, &Request::default()).await.unwrap(); From b9330849a0ba978390d3da5f2b2c1a5f14ecd380 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Mon, 2 Feb 2026 13:30:14 -0800 Subject: [PATCH 09/29] save --- integration/postgres_fdw/dev.sh | 16 + integration/postgres_fdw/pgdog.toml | 18 ++ integration/postgres_fdw/users.toml | 4 + pgdog/src/backend/databases.rs | 6 + pgdog/src/backend/fdw/launcher.rs | 115 +++---- pgdog/src/backend/fdw/lb.rs | 6 + pgdog/src/backend/fdw/mod.rs | 1 + pgdog/src/backend/fdw/postgres.rs | 44 ++- pgdog/src/backend/fdw/postgres_config.rs | 5 +- .../schema/postgres_fdw/custom_types.rs | 291 ++++++++++++++++++ .../schema/postgres_fdw/custom_types.sql | 85 +++++ .../backend/schema/postgres_fdw/extensions.rs | 125 ++++++++ .../schema/postgres_fdw/extensions.sql | 10 + pgdog/src/backend/schema/postgres_fdw/mod.rs | 4 + .../src/backend/schema/postgres_fdw/schema.rs | 28 +- pgdog/src/frontend/listener.rs | 5 + 16 files changed, 686 insertions(+), 77 deletions(-) create mode 100644 integration/postgres_fdw/dev.sh create mode 100644 integration/postgres_fdw/pgdog.toml create mode 100644 integration/postgres_fdw/users.toml create mode 100644 pgdog/src/backend/fdw/lb.rs create mode 100644 pgdog/src/backend/schema/postgres_fdw/custom_types.rs create mode 100644 pgdog/src/backend/schema/postgres_fdw/custom_types.sql create mode 100644 pgdog/src/backend/schema/postgres_fdw/extensions.rs create mode 100644 pgdog/src/backend/schema/postgres_fdw/extensions.sql diff --git a/integration/postgres_fdw/dev.sh 
b/integration/postgres_fdw/dev.sh new file mode 100644 index 00000000..49ff2020 --- /dev/null +++ b/integration/postgres_fdw/dev.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -ex -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PGDOG="$SCRIPT_DIR/../../target/debug/pgdog" + +dropdb shard_0_fdw || true +dropdb shard_1_fdw || true + +createdb shard_0_fdw +createdb shard_1_fdw + +psql -f "$SCRIPT_DIR/../schema_sync/ecommerce_schema.sql" shard_0_fdw +psql -f "$SCRIPT_DIR/../schema_sync/ecommerce_schema.sql" shard_1_fdw + +${PGDOG} diff --git a/integration/postgres_fdw/pgdog.toml b/integration/postgres_fdw/pgdog.toml new file mode 100644 index 00000000..d9cd431e --- /dev/null +++ b/integration/postgres_fdw/pgdog.toml @@ -0,0 +1,18 @@ +[fdw] +enabled = true + +[[databases]] +name = "pgdog" +shard = 0 +host = "127.0.0.1" +database_name = "shard_0_fdw" + +[[databases]] +name = "pgdog" +shard = 1 +host = "127.0.0.1" +database_name = "shard_1_fdw" + +[[sharded_tables]] +column = "user_id" +database = "pgdog" diff --git a/integration/postgres_fdw/users.toml b/integration/postgres_fdw/users.toml new file mode 100644 index 00000000..539bb183 --- /dev/null +++ b/integration/postgres_fdw/users.toml @@ -0,0 +1,4 @@ +[[users]] +name = "pgdog" +password = "pgdog" +database = "pgdog" diff --git a/pgdog/src/backend/databases.rs b/pgdog/src/backend/databases.rs index d8028c35..8683b02e 100644 --- a/pgdog/src/backend/databases.rs +++ b/pgdog/src/backend/databases.rs @@ -9,6 +9,7 @@ use parking_lot::lock_api::MutexGuard; use parking_lot::{Mutex, RawMutex}; use tracing::{debug, error, info, warn}; +use crate::backend::fdw::PostgresLauncher; use crate::backend::replication::ShardedSchemas; use crate::config::PoolerMode; use crate::frontend::client::query_engine::two_pc::Manager; @@ -96,6 +97,11 @@ pub fn init() -> Result<(), Error> { // Start two-pc manager. let _monitor = Manager::get(); + // Start postgres_fdw compatibility engine. + if config.config.fdw.enabled { + PostgresLauncher::get().launch(); + } + Ok(()) } diff --git a/pgdog/src/backend/fdw/launcher.rs b/pgdog/src/backend/fdw/launcher.rs index a26e724d..f7742736 100644 --- a/pgdog/src/backend/fdw/launcher.rs +++ b/pgdog/src/backend/fdw/launcher.rs @@ -6,14 +6,14 @@ use std::{ }, }; -use crate::config::config; +use crate::{backend::databases::databases, config::config}; use super::{Error, PostgresProcess}; use once_cell::sync::Lazy; use tokio::{ select, spawn, sync::Notify, - time::{sleep, timeout, Duration}, + time::{sleep, Duration}, }; use tracing::{error, info}; @@ -63,6 +63,19 @@ impl PostgresLauncher { LAUNCHER.clone() } + /// Start the launcher. + /// + /// Idempontent. + pub(crate) fn launch(&self) { + let online = self.online.load(Ordering::Relaxed); + + if online { + return; + } + + self.launch_blue_green(); + } + fn spawn(&self) { let launcher = self.clone(); @@ -73,7 +86,12 @@ impl PostgresLauncher { launcher.restart.notified().await; } - launcher.online.store(true, Ordering::Relaxed); + let online = launcher.online.load(Ordering::Relaxed); + + if !online { + launcher.ready_signal.notify_waiters(); + return; + } loop { info!( @@ -101,6 +119,11 @@ impl PostgresLauncher { self.restart.notify_waiters(); } + pub(crate) async fn shutdown_wait(&self) { + self.shutdown(); + self.wait_ready().await + } + /// Trigger blue/green deployment. 
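launch_blue_green (next hunk) alternates the replacement Postgres between the two configured ports. With the defaults from pgdog-config/src/fdw.rs (blue 6434, green 6433), the flip reduces to:

    fn next_port(current: u16, blue: u16, green: u16) -> u16 {
        if current == blue {
            green
        } else {
            blue
        }
    }

    fn main() {
        assert_eq!(next_port(6434, 6434, 6433), 6433); // blue -> green
        assert_eq!(next_port(6433, 6434, 6433), 6434); // green -> blue
    }
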
pub(crate) fn launch_blue_green(&self) { let fdw = config().config.fdw; @@ -113,27 +136,26 @@ impl PostgresLauncher { self.port.store(port, Ordering::Relaxed); self.ready.store(false, Ordering::Relaxed); + self.online.store(true, Ordering::Relaxed); self.restart.notify_waiters(); } /// Wait for Postgres to be ready. - pub(crate) async fn wait_ready(&self, launch_timeout: Duration) -> Result<(), Error> { + pub(crate) async fn wait_ready(&self) { let ready = self.ready.load(Ordering::Relaxed); if ready { - return Ok(()); + return; } let waiter = self.ready_signal.notified(); let ready = self.ready.load(Ordering::Relaxed); if ready { - return Ok(()); + return; } - timeout(launch_timeout, waiter).await?; - - Ok(()) + waiter.await; } fn mark_ready(&self) { @@ -147,7 +169,13 @@ impl PostgresLauncher { let waiter = process.notify(); process.launch().await?; - process.wait_ready(Duration::MAX).await?; + process.wait_ready().await; + + for cluster in databases().all().values() { + if cluster.shards().len() > 1 { + process.configure(cluster).await?; + } + } self.mark_ready(); @@ -155,10 +183,8 @@ impl PostgresLauncher { _ = self.restart.notified() => { let online = self.online.load(Ordering::Relaxed); if online { - println!("requesting stop"); process.request_stop(); } else { - println!("waiting for stop"); process.stop_wait().await; self.mark_ready(); } @@ -181,54 +207,33 @@ mod test { #[tokio::test] async fn test_postgres_blue_green() { crate::logger(); + let mut address = Address { + host: "127.0.0.1".into(), + port: 6433, + user: "postgres".into(), + database_name: "postgres".into(), + ..Default::default() + }; let launcher = PostgresLauncher::get(); - sleep(Duration::from_millis(10)).await; launcher.launch_blue_green(); - launcher - .wait_ready(Duration::from_millis(5000)) - .await - .unwrap(); - let conn = Server::connect( - &Address { - host: "127.0.0.1".into(), - port: 6433, - user: "postgres".into(), - database_name: "postgres".into(), - ..Default::default() - }, - ServerOptions::default(), - ConnectReason::default(), - ) - .await - .unwrap(); + launcher.wait_ready().await; + let mut conn = + Server::connect(&address, ServerOptions::default(), ConnectReason::default()) + .await + .unwrap(); + conn.execute("SELECT 1").await.unwrap(); drop(conn); launcher.launch_blue_green(); - launcher - .wait_ready(Duration::from_millis(5000)) - .await - .unwrap(); - // let conn = Server::connect( - // &Address { - // host: "127.0.0.1".into(), - // port: 6434, - // user: "postgres".into(), - // ..Default::default() - // }, - // ServerOptions::default(), - // ConnectReason::default(), - // ) - // .await - // .unwrap(); + launcher.wait_ready().await; + + address.port = 6434; + let mut conn = + Server::connect(&address, ServerOptions::default(), ConnectReason::default()) + .await + .unwrap(); + conn.execute("SELECT 1").await.unwrap(); launcher.shutdown(); - launcher - .wait_ready(Duration::from_millis(5000)) - .await - .unwrap(); - // launcher.launch_blue_green(); - // launcher - // .wait_ready(Duration::from_millis(5000)) - // .await - // .unwrap(); + launcher.wait_ready().await; } } diff --git a/pgdog/src/backend/fdw/lb.rs b/pgdog/src/backend/fdw/lb.rs new file mode 100644 index 00000000..c1396c65 --- /dev/null +++ b/pgdog/src/backend/fdw/lb.rs @@ -0,0 +1,6 @@ +use crate::backend::Pool; + +pub(crate) struct FdwLoadBalancer { + primary: Option, + // replicas: +} diff --git a/pgdog/src/backend/fdw/mod.rs b/pgdog/src/backend/fdw/mod.rs index 1709aafb..c948f512 100644 --- a/pgdog/src/backend/fdw/mod.rs +++ 
b/pgdog/src/backend/fdw/mod.rs @@ -1,6 +1,7 @@ pub mod bins; pub mod error; pub mod launcher; +pub mod lb; pub mod postgres; pub mod postgres_config; diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs index ddd86a6a..90166b20 100644 --- a/pgdog/src/backend/fdw/postgres.rs +++ b/pgdog/src/backend/fdw/postgres.rs @@ -23,7 +23,7 @@ use tokio::{ process::{Child, Command}, select, spawn, sync::Notify, - time::{sleep, timeout}, + time::{sleep, Instant}, }; use tracing::{error, info, warn}; @@ -42,11 +42,18 @@ struct PostgresProcessAsync { child: Child, initdb_dir: PathBuf, notify: Arc, + version: f32, + port: u16, } impl PostgresProcessAsync { /// Stop Postgres and cleanup. async fn stop(&mut self) -> Result<(), Error> { + warn!( + "[fdw] stopping PostgreSQL {} running on 0.0.0.0:{}", + self.version, self.port + ); + #[cfg(unix)] { let pid = self.child.id().expect("child has no pid") as i32; @@ -163,6 +170,8 @@ impl PostgresProcess { child, notify: self.notify.clone(), initdb_dir: self.initdb_dir.clone(), + port: self.port, + version: self.version, }; spawn(async move { @@ -254,24 +263,17 @@ impl PostgresProcess { cluster: &Cluster, shard: usize, ) -> Result, Error> { - let mut replica = 0; - let shard = cluster .shards() .get(shard) .ok_or(Error::ShardsHostsMismatch)?; Ok(shard - .pools_with_roles() + .pools() .iter() - .map(|(role, pool)| { - let database = match role { - Role::Primary => format!("{}_p", cluster.identifier().database), - _ => { - replica += 1; - format!("{}_r{}", cluster.identifier().database, replica) - } - }; + .enumerate() + .map(|(number, pool)| { + let database = format!("{}_{}", cluster.identifier().database, number); (database, pool.addr().clone()) }) .collect()) @@ -331,6 +333,7 @@ impl PostgresProcess { /// for this cluster. pub(crate) async fn configure(&mut self, cluster: &Cluster) -> Result<(), Error> { let new_database = self.setup_databases(cluster).await?; + let now = Instant::now(); info!( "[fdw] setting up database={} user={}", @@ -415,6 +418,15 @@ impl PostgresProcess { } } + let elapsed = now.elapsed(); + + info!( + "[fdw] setup complete for database={} user={} in {:.3}ms", + cluster.identifier().database, + cluster.identifier().user, + elapsed.as_secs_f32() * 1000.0, + ); + Ok(()) } @@ -444,10 +456,8 @@ impl PostgresProcess { } /// Wait until process is ready and accepting connections. 
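
A side effect of the refactor above: local databases on the embedded server are now named by pool position instead of role, so the earlier `pgdog_p`/`pgdog_r1` scheme becomes `pgdog_0`/`pgdog_1`, regardless of which pool is the primary. A sketch of the naming, with illustrative inputs (the helper is not part of the patch):

// Sketch: every pool in a shard maps to a database named by position.
fn fdw_database_names(cluster_db: &str, pool_count: usize) -> Vec<String> {
    (0..pool_count)
        .map(|n| format!("{}_{}", cluster_db, n))
        .collect()
}

fn main() {
    assert_eq!(
        fdw_database_names("pgdog", 3),
        vec!["pgdog_0", "pgdog_1", "pgdog_2"]
    );
}
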
- pub(crate) async fn wait_ready(&self, launch_timeout: Duration) -> Result<(), Error> { - timeout(launch_timeout, self.wait_ready_internal()).await?; - - Ok(()) + pub(crate) async fn wait_ready(&self) { + self.wait_ready_internal().await; } async fn wait_ready_internal(&self) { @@ -512,7 +522,7 @@ mod test { let mut process = PostgresProcess::new(None, 45012).await.unwrap(); process.launch().await.unwrap(); - process.wait_ready(Duration::from_secs(5)).await.unwrap(); + process.wait_ready().await; process.configure(&cluster).await.unwrap(); let mut server = process.admin_connection().await.unwrap(); let backends = server diff --git a/pgdog/src/backend/fdw/postgres_config.rs b/pgdog/src/backend/fdw/postgres_config.rs index ac1ee1e5..bc7a4247 100644 --- a/pgdog/src/backend/fdw/postgres_config.rs +++ b/pgdog/src/backend/fdw/postgres_config.rs @@ -1,8 +1,5 @@ use std::path::{Path, PathBuf}; -use tokio::{ - fs::{read_to_string, File}, - io::AsyncWriteExt, -}; +use tokio::{fs::File, io::AsyncWriteExt}; use super::Error; diff --git a/pgdog/src/backend/schema/postgres_fdw/custom_types.rs b/pgdog/src/backend/schema/postgres_fdw/custom_types.rs new file mode 100644 index 00000000..13eab57c --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/custom_types.rs @@ -0,0 +1,291 @@ +//! Custom type definitions (enums, domains, composite types) for foreign tables. + +use std::collections::HashSet; +use std::fmt::Write; + +use crate::net::messages::DataRow; + +use super::quote_identifier; + +/// Query to fetch custom type definitions. +pub static CUSTOM_TYPES_QUERY: &str = include_str!("custom_types.sql"); + +/// Kind of custom type. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CustomTypeKind { + Enum, + Domain, + Composite, +} + +impl CustomTypeKind { + fn from_str(s: &str) -> Option { + match s { + "enum" => Some(Self::Enum), + "domain" => Some(Self::Domain), + "composite" => Some(Self::Composite), + _ => None, + } + } +} + +/// A custom type definition from the database. +#[derive(Debug, Clone)] +pub struct CustomType { + pub kind: CustomTypeKind, + pub schema_name: String, + pub type_name: String, + /// Base type for domains. + pub base_type: String, + /// Constraint definition for domains (e.g., "CHECK (VALUE > 0)"). + pub constraint_def: String, + /// Default value for domains. + pub default_value: String, + /// Collation name. + pub collation_name: String, + /// Collation schema. + pub collation_schema: String, + /// Comma-separated enum labels. + pub enum_labels: String, + /// Comma-separated composite attributes (name type pairs). + pub composite_attributes: String, +} + +impl From for CustomType { + fn from(value: DataRow) -> Self { + let kind_str = value.get_text(0).unwrap_or_default(); + Self { + kind: CustomTypeKind::from_str(&kind_str).unwrap_or(CustomTypeKind::Enum), + schema_name: value.get_text(1).unwrap_or_default(), + type_name: value.get_text(2).unwrap_or_default(), + base_type: value.get_text(3).unwrap_or_default(), + constraint_def: value.get_text(4).unwrap_or_default(), + default_value: value.get_text(5).unwrap_or_default(), + collation_name: value.get_text(6).unwrap_or_default(), + collation_schema: value.get_text(7).unwrap_or_default(), + enum_labels: value.get_text(8).unwrap_or_default(), + composite_attributes: value.get_text(9).unwrap_or_default(), + } + } +} + +impl CustomType { + /// Fully qualified type name. 
+ pub fn qualified_name(&self) -> String { + format!( + "{}.{}", + quote_identifier(&self.schema_name), + quote_identifier(&self.type_name) + ) + } + + /// Generate the CREATE statement for this type. + pub fn create_statement(&self) -> Result { + match self.kind { + CustomTypeKind::Enum => self.create_enum_statement(), + CustomTypeKind::Domain => self.create_domain_statement(), + CustomTypeKind::Composite => self.create_composite_statement(), + } + } + + fn create_enum_statement(&self) -> Result { + let mut sql = String::new(); + write!(sql, "CREATE TYPE {} AS ENUM (", self.qualified_name())?; + + let labels: Vec<&str> = self.enum_labels.split(',').collect(); + for (i, label) in labels.iter().enumerate() { + if i > 0 { + sql.push_str(", "); + } + write!(sql, "'{}'", label.replace('\'', "''"))?; + } + + sql.push(')'); + Ok(sql) + } + + fn create_domain_statement(&self) -> Result { + let mut sql = String::new(); + write!( + sql, + "CREATE DOMAIN {} AS {}", + self.qualified_name(), + self.base_type + )?; + + if self.has_collation() { + write!( + sql, + " COLLATE {}.{}", + quote_identifier(&self.collation_schema), + quote_identifier(&self.collation_name) + )?; + } + + if !self.default_value.is_empty() { + write!(sql, " DEFAULT {}", self.default_value)?; + } + + if !self.constraint_def.is_empty() { + write!(sql, " {}", self.constraint_def)?; + } + + Ok(sql) + } + + fn create_composite_statement(&self) -> Result { + let mut sql = String::new(); + write!(sql, "CREATE TYPE {} AS (", self.qualified_name())?; + + // Split on newlines since type definitions can contain commas + let attrs: Vec<&str> = self.composite_attributes.split('\n').collect(); + for (i, attr) in attrs.iter().enumerate() { + if i > 0 { + sql.push_str(", "); + } + let attr = attr.trim(); + if let Some((name, typ)) = attr.split_once(' ') { + write!(sql, "{} {}", quote_identifier(name), typ)?; + } else { + sql.push_str(attr); + } + } + + sql.push(')'); + Ok(sql) + } + + fn has_collation(&self) -> bool { + !self.collation_name.is_empty() && !self.collation_schema.is_empty() + } +} + +/// Collection of custom types from a database. +#[derive(Debug, Clone, Default)] +pub struct CustomTypes { + types: Vec, +} + +impl CustomTypes { + /// Load custom types from a server. + pub(crate) async fn load( + server: &mut crate::backend::Server, + ) -> Result { + let types: Vec = server.fetch_all(CUSTOM_TYPES_QUERY).await?; + Ok(Self { types }) + } + + /// Create all custom types on the target server. + pub(crate) async fn setup( + &self, + server: &mut crate::backend::Server, + ) -> Result<(), crate::backend::Error> { + let mut created_schemas = HashSet::new(); + + for custom_type in &self.types { + if !created_schemas.contains(&custom_type.schema_name) { + server + .execute(&format!( + "CREATE SCHEMA IF NOT EXISTS {}", + quote_identifier(&custom_type.schema_name) + )) + .await?; + created_schemas.insert(custom_type.schema_name.clone()); + } + + let stmt = custom_type.create_statement()?; + + tracing::debug!("[fdw::setup] {} [{}]", stmt, server.addr()); + server.execute(&stmt).await?; + } + + Ok(()) + } + + /// Get the types. + pub fn types(&self) -> &[CustomType] { + &self.types + } + + /// Check if there are any custom types. 
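
Since enum labels are spliced into the statement as string literals, doubling single quotes is the only escaping required; identifiers go through `quote_identifier` instead. A standalone sketch of the label rendering, mirroring the `create_enum_statement` logic above:

// Sketch: render comma-separated enum labels as SQL string literals,
// doubling any embedded single quotes.
fn enum_labels_sql(labels: &str) -> String {
    labels
        .split(',')
        .map(|label| format!("'{}'", label.replace('\'', "''")))
        .collect::<Vec<_>>()
        .join(", ")
}

fn main() {
    assert_eq!(enum_labels_sql("active,o'clock"), "'active', 'o''clock'");
}

Note that this inherits the schema query's assumption that labels themselves contain no commas, since `string_agg` joins them with one.
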
+ pub fn is_empty(&self) -> bool { + self.types.is_empty() + } +} + +#[cfg(test)] +mod test { + use super::*; + + fn test_enum() -> CustomType { + CustomType { + kind: CustomTypeKind::Enum, + schema_name: "core".into(), + type_name: "user_status".into(), + base_type: String::new(), + constraint_def: String::new(), + default_value: String::new(), + collation_name: String::new(), + collation_schema: String::new(), + enum_labels: "active,inactive,suspended".into(), + composite_attributes: String::new(), + } + } + + fn test_domain() -> CustomType { + CustomType { + kind: CustomTypeKind::Domain, + schema_name: "core".into(), + type_name: "email".into(), + base_type: "character varying(255)".into(), + constraint_def: "CHECK ((VALUE)::text ~ '^[A-Za-z0-9._%+-]+@'::text)".into(), + default_value: String::new(), + collation_name: String::new(), + collation_schema: String::new(), + enum_labels: String::new(), + composite_attributes: String::new(), + } + } + + fn test_composite() -> CustomType { + CustomType { + kind: CustomTypeKind::Composite, + schema_name: "core".into(), + type_name: "geo_point".into(), + base_type: String::new(), + constraint_def: String::new(), + default_value: String::new(), + collation_name: String::new(), + collation_schema: String::new(), + enum_labels: String::new(), + composite_attributes: "latitude numeric(9,6)\nlongitude numeric(9,6)".into(), + } + } + + #[test] + fn test_create_enum_statement() { + let t = test_enum(); + let sql = t.create_statement().unwrap(); + assert_eq!( + sql, + r#"CREATE TYPE "core"."user_status" AS ENUM ('active', 'inactive', 'suspended')"# + ); + } + + #[test] + fn test_create_domain_statement() { + let t = test_domain(); + let sql = t.create_statement().unwrap(); + assert!(sql.contains(r#"CREATE DOMAIN "core"."email" AS character varying(255)"#)); + assert!(sql.contains("CHECK")); + } + + #[test] + fn test_create_composite_statement() { + let t = test_composite(); + let sql = t.create_statement().unwrap(); + assert!(sql.contains(r#"CREATE TYPE "core"."geo_point" AS ("#)); + assert!(sql.contains(r#""latitude" numeric(9,6)"#)); + assert!(sql.contains(r#""longitude" numeric(9,6)"#)); + } +} diff --git a/pgdog/src/backend/schema/postgres_fdw/custom_types.sql b/pgdog/src/backend/schema/postgres_fdw/custom_types.sql new file mode 100644 index 00000000..95360cce --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/custom_types.sql @@ -0,0 +1,85 @@ +-- Query to fetch custom type definitions (enums, domains, composite types) +-- for recreation on FDW server before creating foreign tables. 
+ +-- Enums: type info and values +SELECT + 'enum' AS type_kind, + n.nspname::text AS schema_name, + t.typname::text AS type_name, + NULL::text AS base_type, + NULL::text AS constraint_def, + NULL::text AS default_value, + NULL::text AS collation_name, + NULL::text AS collation_schema, + string_agg(e.enumlabel::text, ',' ORDER BY e.enumsortorder)::text AS enum_labels, + NULL::text AS composite_attributes +FROM pg_catalog.pg_type t +JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid +JOIN pg_catalog.pg_enum e ON e.enumtypid = t.oid +WHERE t.typtype = 'e' + AND n.nspname <> 'pg_catalog' + AND n.nspname !~ '^pg_toast' + AND n.nspname <> 'information_schema' +GROUP BY n.nspname, t.typname + +UNION ALL + +-- Domains: base type, constraints, defaults, collation +SELECT + 'domain' AS type_kind, + n.nspname::text AS schema_name, + t.typname::text AS type_name, + pg_catalog.format_type(t.typbasetype, t.typtypmod)::text AS base_type, + ( + SELECT string_agg(pg_catalog.pg_get_constraintdef(c.oid, true), ' ' ORDER BY c.conname) + FROM pg_catalog.pg_constraint c + WHERE c.contypid = t.oid + )::text AS constraint_def, + t.typdefault::text AS default_value, + coll.collname::text AS collation_name, + collnsp.nspname::text AS collation_schema, + NULL::text AS enum_labels, + NULL::text AS composite_attributes +FROM pg_catalog.pg_type t +JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid +LEFT JOIN pg_catalog.pg_collation coll ON coll.oid = t.typcollation +LEFT JOIN pg_catalog.pg_namespace collnsp ON collnsp.oid = coll.collnamespace +WHERE t.typtype = 'd' + AND n.nspname <> 'pg_catalog' + AND n.nspname !~ '^pg_toast' + AND n.nspname <> 'information_schema' + +UNION ALL + +-- Composite types (excluding table row types) +-- Uses newline as separator since type definitions can contain commas +SELECT + 'composite' AS type_kind, + n.nspname::text AS schema_name, + t.typname::text AS type_name, + NULL::text AS base_type, + NULL::text AS constraint_def, + NULL::text AS default_value, + NULL::text AS collation_name, + NULL::text AS collation_schema, + NULL::text AS enum_labels, + ( + SELECT string_agg( + a.attname || ' ' || pg_catalog.format_type(a.atttypid, a.atttypmod), + E'\n' ORDER BY a.attnum + ) + FROM pg_catalog.pg_attribute a + WHERE a.attrelid = t.typrelid + AND a.attnum > 0 + AND NOT a.attisdropped + )::text AS composite_attributes +FROM pg_catalog.pg_type t +JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid +JOIN pg_catalog.pg_class c ON c.oid = t.typrelid +WHERE t.typtype = 'c' + AND c.relkind = 'c' -- Only standalone composite types, not table row types + AND n.nspname <> 'pg_catalog' + AND n.nspname !~ '^pg_toast' + AND n.nspname <> 'information_schema' + +ORDER BY type_kind, schema_name, type_name diff --git a/pgdog/src/backend/schema/postgres_fdw/extensions.rs b/pgdog/src/backend/schema/postgres_fdw/extensions.rs new file mode 100644 index 00000000..61a9cc50 --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/extensions.rs @@ -0,0 +1,125 @@ +//! Extension definitions for foreign tables. + +use std::fmt::Write; + +use crate::net::messages::DataRow; + +use super::quote_identifier; + +/// Query to fetch installed extensions. +pub static EXTENSIONS_QUERY: &str = include_str!("extensions.sql"); + +/// An installed extension. 
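
All three branches of the UNION in custom_types.sql return the same ten columns, padding the unused ones with NULL, so a single `From<DataRow>` conversion covers every kind. Composite attributes are newline-separated because a type like `numeric(9,6)` already contains commas. A sketch of splitting such an attribute list, matching the `create_composite_statement` logic (it only collects well-formed "name type" pairs):

// Sketch: composite attributes arrive one "name type" pair per line.
fn split_attributes(attrs: &str) -> Vec<(&str, &str)> {
    attrs
        .split('\n')
        .filter_map(|attr| attr.trim().split_once(' '))
        .collect()
}

fn main() {
    let attrs = "latitude numeric(9,6)\nlongitude numeric(9,6)";
    assert_eq!(
        split_attributes(attrs),
        vec![("latitude", "numeric(9,6)"), ("longitude", "numeric(9,6)")]
    );
}
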
+#[derive(Debug, Clone)] +pub struct Extension { + pub name: String, + pub schema_name: String, + pub version: String, +} + +impl From for Extension { + fn from(value: DataRow) -> Self { + Self { + name: value.get_text(0).unwrap_or_default(), + schema_name: value.get_text(1).unwrap_or_default(), + version: value.get_text(2).unwrap_or_default(), + } + } +} + +impl Extension { + /// Generate the CREATE EXTENSION statement. + pub fn create_statement(&self) -> Result { + let mut sql = String::new(); + write!( + sql, + "CREATE EXTENSION IF NOT EXISTS {}", + quote_identifier(&self.name) + )?; + + // Only specify schema if it's not the default 'public' + if !self.schema_name.is_empty() && self.schema_name != "public" { + write!(sql, " SCHEMA {}", quote_identifier(&self.schema_name))?; + } + + Ok(sql) + } +} + +/// Collection of extensions from a database. +#[derive(Debug, Clone, Default)] +pub struct Extensions { + extensions: Vec, +} + +impl Extensions { + /// Load extensions from a server. + pub(crate) async fn load( + server: &mut crate::backend::Server, + ) -> Result { + let extensions: Vec = server.fetch_all(EXTENSIONS_QUERY).await?; + Ok(Self { extensions }) + } + + /// Create all extensions on the target server. + pub(crate) async fn setup( + &self, + server: &mut crate::backend::Server, + ) -> Result<(), crate::backend::Error> { + for ext in &self.extensions { + let stmt = ext.create_statement()?; + tracing::debug!("[fdw::setup] {} [{}]", stmt, server.addr()); + server.execute(&stmt).await?; + } + + Ok(()) + } + + /// Get the extensions. + pub fn extensions(&self) -> &[Extension] { + &self.extensions + } + + /// Check if there are any extensions. + pub fn is_empty(&self) -> bool { + self.extensions.is_empty() + } +} + +#[cfg(test)] +mod test { + use super::*; + + fn test_extension() -> Extension { + Extension { + name: "ltree".into(), + schema_name: "public".into(), + version: "1.2".into(), + } + } + + fn test_extension_with_schema() -> Extension { + Extension { + name: "pg_trgm".into(), + schema_name: "extensions".into(), + version: "1.6".into(), + } + } + + #[test] + fn test_create_extension_statement() { + let ext = test_extension(); + let sql = ext.create_statement().unwrap(); + assert_eq!(sql, r#"CREATE EXTENSION IF NOT EXISTS "ltree""#); + } + + #[test] + fn test_create_extension_statement_with_schema() { + let ext = test_extension_with_schema(); + let sql = ext.create_statement().unwrap(); + assert_eq!( + sql, + r#"CREATE EXTENSION IF NOT EXISTS "pg_trgm" SCHEMA "extensions""# + ); + } +} diff --git a/pgdog/src/backend/schema/postgres_fdw/extensions.sql b/pgdog/src/backend/schema/postgres_fdw/extensions.sql new file mode 100644 index 00000000..db87a2d2 --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/extensions.sql @@ -0,0 +1,10 @@ +-- Query to fetch installed extensions for recreation on FDW server. +-- Excludes built-in extensions that are always available. +SELECT + e.extname::text AS extension_name, + n.nspname::text AS schema_name, + e.extversion::text AS version +FROM pg_catalog.pg_extension e +JOIN pg_catalog.pg_namespace n ON e.extnamespace = n.oid +WHERE e.extname NOT IN ('plpgsql') -- Exclude always-installed extensions +ORDER BY e.extname diff --git a/pgdog/src/backend/schema/postgres_fdw/mod.rs b/pgdog/src/backend/schema/postgres_fdw/mod.rs index 8b25b89e..2673d0e6 100644 --- a/pgdog/src/backend/schema/postgres_fdw/mod.rs +++ b/pgdog/src/backend/schema/postgres_fdw/mod.rs @@ -1,10 +1,14 @@ //! Schema information for creating foreign tables via postgres_fdw. 
+mod custom_types; mod error; +mod extensions; mod schema; mod statement; +pub use custom_types::{CustomType, CustomTypeKind, CustomTypes, CUSTOM_TYPES_QUERY}; pub use error::Error; +pub use extensions::{Extension, Extensions, EXTENSIONS_QUERY}; pub use schema::{ForeignTableColumn, ForeignTableSchema, FOREIGN_TABLE_SCHEMA}; pub use statement::{create_foreign_table, ForeignTableBuilder, PartitionStrategy}; diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs index 170751eb..2f92a77f 100644 --- a/pgdog/src/backend/schema/postgres_fdw/schema.rs +++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs @@ -8,6 +8,9 @@ use crate::{ net::messages::DataRow, }; +use super::custom_types::CustomTypes; +use super::extensions::Extensions; + /// Query to fetch table and column information needed for CREATE FOREIGN TABLE statements. pub static FOREIGN_TABLE_SCHEMA: &str = include_str!("postgres_fdw.sql"); @@ -49,12 +52,19 @@ impl From for ForeignTableColumn { #[derive(Debug, Clone)] pub struct ForeignTableSchema { tables: HashMap<(String, String), Vec>, + extensions: Extensions, + custom_types: CustomTypes, } impl ForeignTableSchema { pub(crate) async fn load(server: &mut Server) -> Result { + let tables = ForeignTableColumn::load(server).await?; + let extensions = Extensions::load(server).await?; + let custom_types = CustomTypes::load(server).await?; Ok(Self { - tables: ForeignTableColumn::load(server).await?, + tables, + extensions, + custom_types, }) } @@ -63,6 +73,12 @@ impl ForeignTableSchema { server: &mut Server, sharding_schema: &ShardingSchema, ) -> Result<(), super::super::Error> { + // Create extensions first (types may depend on them) + self.extensions.setup(server).await?; + + // Create custom types (enums, domains, composite types) + self.custom_types.setup(server).await?; + let mut schemas = HashSet::new(); let mut tables = HashSet::new(); @@ -89,6 +105,16 @@ impl ForeignTableSchema { } Ok(()) } + + /// Get the extensions. + pub fn extensions(&self) -> &Extensions { + &self.extensions + } + + /// Get the custom types. 
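
The call order in `setup` above encodes the dependency chain: extensions may provide types, custom types may be referenced by columns, and the foreign tables come last. A sketch of the phases with one illustrative statement each (examples drawn from this patch's own tests; the function itself is not part of the patch):

// Sketch: dependency-ordered DDL phases for bringing up the FDW server.
fn setup_phases() -> Vec<(&'static str, &'static str)> {
    vec![
        ("extensions", r#"CREATE EXTENSION IF NOT EXISTS "ltree""#),
        ("custom types", r#"CREATE TYPE "core"."user_status" AS ENUM ('active')"#),
        ("schemas", r#"CREATE SCHEMA IF NOT EXISTS "core""#),
        ("foreign tables", r#"CREATE FOREIGN TABLE /* ... */ SERVER "shard_0""#),
    ]
}

fn main() {
    for (phase, example) in setup_phases() {
        println!("{}: {}", phase, example);
    }
}
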
+    pub fn custom_types(&self) -> &CustomTypes {
+        &self.custom_types
+    }
 }
 
 impl ForeignTableColumn {
diff --git a/pgdog/src/frontend/listener.rs b/pgdog/src/frontend/listener.rs
index 250ec653..1fc92946 100644
--- a/pgdog/src/frontend/listener.rs
+++ b/pgdog/src/frontend/listener.rs
@@ -5,6 +5,7 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 
 use crate::backend::databases::{databases, reload, shutdown};
+use crate::backend::fdw::PostgresLauncher;
 use crate::config::config;
 use crate::frontend::client::query_engine::two_pc::Manager;
 use crate::net::messages::BackendKeyData;
@@ -151,6 +152,10 @@ impl Listener {
         }
 
         self.shutdown.notify_waiters();
+
+        if let Err(_) = timeout(shutdown_timeout, PostgresLauncher::get().shutdown_wait()).await {
+            error!("[fdw] graceful shutdown failed");
+        }
     }
 
     async fn handle_client(stream: TcpStream, addr: SocketAddr) -> Result<(), Error> {

From d89c921811817e5a583e293a115034654bd1c8e7 Mon Sep 17 00:00:00 2001
From: Lev Kokotov
Date: Mon, 2 Feb 2026 14:15:37 -0800
Subject: [PATCH 10/29] save

---
 pgdog/src/backend/fdw/lb.rs       | 22 +++++++++++++++++++---
 pgdog/src/backend/fdw/postgres.rs |  5 ++---
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/pgdog/src/backend/fdw/lb.rs b/pgdog/src/backend/fdw/lb.rs
index c1396c65..36c9c240 100644
--- a/pgdog/src/backend/fdw/lb.rs
+++ b/pgdog/src/backend/fdw/lb.rs
@@ -1,6 +1,22 @@
-use crate::backend::Pool;
+use crate::backend::{Cluster, LoadBalancer, Pool};
+
+use super::Error;
+use super::PostgresProcess;
 
 pub(crate) struct FdwLoadBalancer {
-    primary: Option<Pool>,
-    // replicas:
+    lb: LoadBalancer,
+}
+
+impl FdwLoadBalancer {
+    pub(crate) fn new(cluster: &Cluster) -> Result<Self, Error> {
+        // We check that all shards have identical configs
+        // before enabling this feature.
+        let pools = PostgresProcess::pools_to_databases(cluster, 0)?;
+        let pools = cluster
+            .shards()
+            .get(0)
+            .ok_or(Error::ShardsHostsMismatch)?
+ .pools_with_roles(); + todo!() + } } diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs index 90166b20..ab640ae6 100644 --- a/pgdog/src/backend/fdw/postgres.rs +++ b/pgdog/src/backend/fdw/postgres.rs @@ -259,7 +259,7 @@ impl PostgresProcess { .collect()) } - fn pools_to_databases( + pub(super) fn pools_to_databases( cluster: &Cluster, shard: usize, ) -> Result, Error> { @@ -284,13 +284,12 @@ impl PostgresProcess { .shards() .iter() .map(|shard| { - let mut roles: Vec<_> = shard + let roles: Vec<_> = shard .pools_with_roles() .iter() .map(|(role, _)| role) .cloned() .collect(); - roles.sort(); roles }) .collect(); From 0558f05916e34381254308df5de8b0b4e089fc4b Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Tue, 3 Feb 2026 17:38:53 -0800 Subject: [PATCH 11/29] save --- pgdog/src/backend/databases.rs | 11 +- pgdog/src/backend/fdw/launcher.rs | 285 +++++++++++++----- pgdog/src/backend/fdw/mod.rs | 1 + pgdog/src/backend/fdw/postgres.rs | 166 ++++++---- pgdog/src/backend/pool/connection/mod.rs | 4 +- pgdog/src/backend/pool/error.rs | 3 + .../frontend/client/query_engine/connect.rs | 10 + .../client/query_engine/route_query.rs | 1 + pgdog/src/frontend/listener.rs | 4 +- pgdog/src/frontend/router/parser/route.rs | 22 ++ pgdog/src/net/parameter.rs | 6 + 11 files changed, 368 insertions(+), 145 deletions(-) diff --git a/pgdog/src/backend/databases.rs b/pgdog/src/backend/databases.rs index 8683b02e..23ecf679 100644 --- a/pgdog/src/backend/databases.rs +++ b/pgdog/src/backend/databases.rs @@ -89,6 +89,12 @@ pub fn reload_from_existing() -> Result<(), Error> { /// Initialize the databases for the first time. pub fn init() -> Result<(), Error> { let config = config(); + + // Start postgres_fdw compatibility engine. + if config.config.fdw.enabled { + PostgresLauncher::get().launch(); + } + replace_databases(from_config(&config), false)?; // Resize query cache @@ -97,11 +103,6 @@ pub fn init() -> Result<(), Error> { // Start two-pc manager. let _monitor = Manager::get(); - // Start postgres_fdw compatibility engine. - if config.config.fdw.enabled { - PostgresLauncher::get().launch(); - } - Ok(()) } diff --git a/pgdog/src/backend/fdw/launcher.rs b/pgdog/src/backend/fdw/launcher.rs index f7742736..823d3637 100644 --- a/pgdog/src/backend/fdw/launcher.rs +++ b/pgdog/src/backend/fdw/launcher.rs @@ -12,12 +12,12 @@ use super::{Error, PostgresProcess}; use once_cell::sync::Lazy; use tokio::{ select, spawn, - sync::Notify, + sync::watch, time::{sleep, Duration}, }; -use tracing::{error, info}; +use tracing::{error, info, warn}; -const LAUNCHER: Lazy = Lazy::new(PostgresLauncher::new); +static LAUNCHER: Lazy = Lazy::new(PostgresLauncher::new); #[derive(Debug, Clone)] pub struct PostgresLauncher { @@ -32,13 +32,20 @@ impl Deref for PostgresLauncher { } } -#[derive(Debug, Default)] +#[derive(Debug)] pub struct Inner { - restart: Notify, + /// Incremented to trigger restart/shutdown check in the spawn loop. + restart_trigger: watch::Sender, + /// Whether the launcher should be running. online: AtomicBool, + /// True when shutdown has been requested (used to detect early shutdown). + shutdown_requested: AtomicBool, + /// Current blue/green port. port: AtomicU16, - ready_signal: Notify, - ready: AtomicBool, + /// True when postgres is ready to accept connections. + ready: watch::Sender, + /// True when shutdown is complete. 
+ shutdown_complete: watch::Sender, } impl PostgresLauncher { @@ -46,16 +53,23 @@ impl PostgresLauncher { let fdw = config().config.fdw; let port = AtomicU16::new(fdw.blue_port); - let laucher = Self { + let (restart_trigger, _) = watch::channel(0u64); + let (ready, _) = watch::channel(false); + let (shutdown_complete, _) = watch::channel(false); + + let launcher = Self { inner: Arc::new(Inner { + restart_trigger, + online: AtomicBool::new(false), + shutdown_requested: AtomicBool::new(false), port, - ..Default::default() + ready, + shutdown_complete, }), }; - laucher.spawn(); - - laucher + launcher.spawn(); + launcher } /// Get the launcher singleton instance. @@ -63,139 +77,202 @@ impl PostgresLauncher { LAUNCHER.clone() } + /// Get current blue/green port. + pub(crate) fn port(&self) -> u16 { + self.port.load(Ordering::Relaxed) + } + /// Start the launcher. /// /// Idempontent. pub(crate) fn launch(&self) { - let online = self.online.load(Ordering::Relaxed); - - if online { + if self.online.load(Ordering::Relaxed) { return; } - self.launch_blue_green(); + self.launch_blue_green(false); } fn spawn(&self) { let launcher = self.clone(); spawn(async move { - let online = launcher.online.load(Ordering::Relaxed); - - if !online { - launcher.restart.notified().await; - } + let mut restart_receiver = launcher.restart_trigger.subscribe(); - let online = launcher.online.load(Ordering::Relaxed); + loop { + let online = launcher.online.load(Ordering::Relaxed); - if !online { - launcher.ready_signal.notify_waiters(); - return; - } + if !online { + // Check if shutdown was already requested before we even started. + if launcher.shutdown_requested.load(Ordering::Acquire) { + launcher.ready.send_modify(|v| *v = true); + launcher.mark_shutdown(); + return; + } + + // Wait for trigger (launch or shutdown). + let _ = restart_receiver.changed().await; + + // Re-check if this was a shutdown request. + if launcher.shutdown_requested.load(Ordering::Acquire) { + launcher.ready.send_modify(|v| *v = true); + launcher.mark_shutdown(); + return; + } + } - loop { info!( "[fdw] launching fdw backend on 0.0.0.0:{}", launcher.port.load(Ordering::Relaxed), ); - if let Err(err) = launcher.run().await { - error!("[fdw] launcher exited with error: {}", err); + let had_error = launcher.run(&mut restart_receiver).await.is_err(); + if had_error { + error!("[fdw] launcher exited with error"); } let online = launcher.online.load(Ordering::Relaxed); if !online { break; - } else { + } + + // Delay retry only on error to prevent tight loops. + if had_error { sleep(Duration::from_millis(1000)).await; } } + + // Signal shutdown complete when exiting the loop. + launcher.mark_shutdown(); }); } pub(crate) fn shutdown(&self) { + self.shutdown_requested.store(true, Ordering::Release); self.online.store(false, Ordering::Relaxed); - self.ready.store(false, Ordering::Relaxed); - self.restart.notify_waiters(); + self.ready.send_modify(|v| *v = false); + self.shutdown_complete.send_modify(|v| *v = false); + self.restart_trigger.send_modify(|v| *v = v.wrapping_add(1)); } pub(crate) async fn shutdown_wait(&self) { self.shutdown(); - self.wait_ready().await + self.wait_shutdown().await; } /// Trigger blue/green deployment. 
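
The move from `Notify` to `watch` channels in this patch closes a lost-wakeup race: `Notify::notify_waiters` wakes only tasks that are already parked, while a `watch` channel retains the latest value, so a subscriber that arrives after the state change still observes it (the same property the patch's own "even without receivers" comment relies on). A minimal, self-contained illustration:

use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (tx, _rx) = watch::channel(false);

    // State changes before anyone new is waiting...
    tx.send_modify(|v| *v = true);

    // ...yet a late subscriber still sees the current value,
    // which a notify_waiters-based signal would have dropped.
    let rx = tx.subscribe();
    assert!(*rx.borrow());
}
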
- pub(crate) fn launch_blue_green(&self) { + pub(crate) fn launch_blue_green(&self, failover: bool) { let fdw = config().config.fdw; let port = self.port.load(Ordering::Relaxed); - let port = if port == fdw.blue_port { - fdw.green_port + + let port = if failover { + if port == fdw.blue_port { + fdw.green_port + } else { + fdw.blue_port + } } else { - fdw.blue_port + port }; + warn!("[fdw] relaunching fdw backend on 0.0.0.0:{}", port); + self.port.store(port, Ordering::Relaxed); - self.ready.store(false, Ordering::Relaxed); + // Use send_modify to ensure value is updated even without receivers. + self.ready.send_modify(|v| *v = false); + self.shutdown_complete.send_modify(|v| *v = false); self.online.store(true, Ordering::Relaxed); - self.restart.notify_waiters(); + self.restart_trigger.send_modify(|v| *v = v.wrapping_add(1)); } /// Wait for Postgres to be ready. pub(crate) async fn wait_ready(&self) { - let ready = self.ready.load(Ordering::Relaxed); + let mut receiver = self.ready.subscribe(); - if ready { + // Check current state first. + if *receiver.borrow() { return; } - let waiter = self.ready_signal.notified(); - let ready = self.ready.load(Ordering::Relaxed); - - if ready { - return; + // Wait for ready to become true. + while receiver.changed().await.is_ok() { + if *receiver.borrow() { + return; + } } - - waiter.await; } fn mark_ready(&self) { - self.ready.store(true, Ordering::Relaxed); - self.ready_signal.notify_waiters(); + self.ready.send_modify(|v| *v = true); } - async fn run(&self) -> Result<(), Error> { - let port = self.port.load(Ordering::Relaxed); + async fn run(&self, restart_receiver: &mut watch::Receiver) -> Result<(), Error> { + let port = self.port(); let mut process = PostgresProcess::new(None, port).await?; - let waiter = process.notify(); + let mut shutdown_receiver = process.shutdown_receiver(); - process.launch().await?; - process.wait_ready().await; + // Use a closure to ensure process is stopped on any exit path. + let result = async { + process.launch().await?; + process.wait_ready().await; - for cluster in databases().all().values() { - if cluster.shards().len() > 1 { - process.configure(cluster).await?; + for cluster in databases().all().values() { + if cluster.shards().len() > 1 { + process.configure(cluster).await?; + } } - } - self.mark_ready(); + self.mark_ready(); - select! { - _ = self.restart.notified() => { - let online = self.online.load(Ordering::Relaxed); - if online { - process.request_stop(); - } else { - process.stop_wait().await; - self.mark_ready(); + select! { + _ = restart_receiver.changed() => { + let online = self.online.load(Ordering::Relaxed); + if online { + process.request_stop(); + } else { + process.stop_wait().await; + } } - } - _ = waiter.notified() => { - // Unexpected exit. + _ = shutdown_receiver.changed() => { + // Process exited (possibly unexpectedly). + // Clear pid to prevent dirty shutdown warning since process already exited. + process.clear_pid(); + } } + + Ok::<(), Error>(()) + } + .await; + + // Ensure process is stopped if we're exiting due to an error. + if result.is_err() { + process.stop_wait().await; + } + + self.mark_shutdown(); + + result + } + + fn mark_shutdown(&self) { + self.shutdown_complete.send_modify(|v| *v = true); + } + + async fn wait_shutdown(&self) { + let mut receiver = self.shutdown_complete.subscribe(); + + // Check current state first. + if *receiver.borrow() { + return; } - Ok(()) + // Wait for shutdown_complete to become true. 
+ while receiver.changed().await.is_ok() { + if *receiver.borrow() { + return; + } + } } } @@ -203,37 +280,85 @@ impl PostgresLauncher { mod test { use super::*; use crate::backend::{pool::Address, ConnectReason, Server, ServerOptions}; + use crate::config::config; #[tokio::test] async fn test_postgres_blue_green() { + use tokio::time::timeout; + crate::logger(); + let fdw = config().config.fdw; + let mut address = Address { host: "127.0.0.1".into(), - port: 6433, + port: fdw.blue_port, user: "postgres".into(), database_name: "postgres".into(), ..Default::default() }; let launcher = PostgresLauncher::get(); - launcher.launch_blue_green(); - launcher.wait_ready().await; + launcher.launch_blue_green(false); + + timeout(Duration::from_secs(10), launcher.wait_ready()) + .await + .expect("timeout waiting for first ready"); + let mut conn = Server::connect(&address, ServerOptions::default(), ConnectReason::default()) .await .unwrap(); conn.execute("SELECT 1").await.unwrap(); drop(conn); - launcher.launch_blue_green(); - launcher.wait_ready().await; - address.port = 6434; + launcher.launch_blue_green(true); + + timeout(Duration::from_secs(10), launcher.wait_ready()) + .await + .expect("timeout waiting for second ready"); + + address.port = fdw.green_port; let mut conn = Server::connect(&address, ServerOptions::default(), ConnectReason::default()) .await .unwrap(); conn.execute("SELECT 1").await.unwrap(); - launcher.shutdown(); - launcher.wait_ready().await; + + timeout(Duration::from_secs(10), launcher.shutdown_wait()) + .await + .expect("timeout waiting for shutdown"); + } + + #[tokio::test] + async fn test_shutdown_without_start() { + use tokio::time::timeout; + + // Test that shutdown_wait() works even if FDW was never started. + // This creates a new launcher directly (not the singleton) to avoid + // interference from other tests. 
+ let (restart_trigger, _) = watch::channel(0u64); + let (ready, _) = watch::channel(false); + let (shutdown_complete, _) = watch::channel(false); + + let launcher = PostgresLauncher { + inner: Arc::new(Inner { + restart_trigger, + online: AtomicBool::new(false), + shutdown_requested: AtomicBool::new(false), + port: AtomicU16::new(6433), + ready, + shutdown_complete, + }), + }; + + launcher.spawn(); + + // Give spawn task time to start waiting + sleep(Duration::from_millis(10)).await; + + // Shutdown without ever starting - should not hang + timeout(Duration::from_secs(5), launcher.shutdown_wait()) + .await + .expect("shutdown_wait() hung when FDW was never started"); } } diff --git a/pgdog/src/backend/fdw/mod.rs b/pgdog/src/backend/fdw/mod.rs index c948f512..236c38d0 100644 --- a/pgdog/src/backend/fdw/mod.rs +++ b/pgdog/src/backend/fdw/mod.rs @@ -7,5 +7,6 @@ pub mod postgres_config; pub use error::Error; pub(crate) use launcher::PostgresLauncher; +pub(crate) use lb::FdwLoadBalancer; pub(crate) use postgres::PostgresProcess; pub(crate) use postgres_config::PostgresConfig; diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs index ab640ae6..61d53e2a 100644 --- a/pgdog/src/backend/fdw/postgres.rs +++ b/pgdog/src/backend/fdw/postgres.rs @@ -2,7 +2,6 @@ use std::{ collections::{HashMap, HashSet}, path::{Path, PathBuf}, process::Stdio, - sync::Arc, time::Duration, }; @@ -22,7 +21,7 @@ use tokio::{ io::{AsyncBufReadExt, BufReader}, process::{Child, Command}, select, spawn, - sync::Notify, + sync::watch, time::{sleep, Instant}, }; use tracing::{error, info, warn}; @@ -30,7 +29,7 @@ use tracing::{error, info, warn}; use crate::backend::{ pool::{Address, Config, PoolConfig, Request}, schema::postgres_fdw::{quote_identifier, ForeignTableSchema}, - Cluster, ConnectReason, Pool, Server, ServerOptions, + Cluster, ConnectReason, Server, ServerOptions, }; use super::{bins::Bins, Error, PostgresConfig}; @@ -41,7 +40,8 @@ static LOG_PREFIX: Lazy = struct PostgresProcessAsync { child: Child, initdb_dir: PathBuf, - notify: Arc, + shutdown: watch::Receiver, + shutdown_complete: watch::Sender, version: f32, port: u16, } @@ -83,11 +83,20 @@ impl PostgresProcessAsync { } #[derive(Debug, Clone)] +pub(crate) struct FdwBackend { + pub(crate) config: Config, + pub(crate) address: Address, + pub(crate) database_name: String, + pub(crate) role: Role, +} + +#[derive(Debug)] pub(crate) struct PostgresProcess { postres: PathBuf, initdb: PathBuf, initdb_dir: PathBuf, - notify: Arc, + shutdown: watch::Sender, + shutdown_complete: watch::Sender, port: u16, databases: HashSet, users: HashSet, @@ -97,8 +106,6 @@ pub(crate) struct PostgresProcess { impl PostgresProcess { pub(crate) async fn new(initdb_path: Option<&Path>, port: u16) -> Result { - let notify = Arc::new(Notify::new()); - let initdb_path = if let Some(path) = initdb_path { path.to_owned() } else { @@ -107,11 +114,15 @@ impl PostgresProcess { let bins = Bins::new().await?; + let (shutdown, _) = watch::channel(false); + let (shutdown_complete, _) = watch::channel(false); + Ok(Self { postres: bins.postgres, initdb: bins.initdb, initdb_dir: initdb_path, - notify, + shutdown, + shutdown_complete, port, databases: HashSet::new(), users: HashSet::new(), @@ -155,20 +166,25 @@ impl PostgresProcess { self.version, self.port ); - let child = Command::new(&self.postres) - .arg("-D") + let mut cmd = Command::new(&self.postres); + cmd.arg("-D") .arg(&self.initdb_dir) .arg("-k") .arg(&self.initdb_dir) .stdout(Stdio::piped()) - 
.stderr(Stdio::piped()) - .spawn()?; + .stderr(Stdio::piped()); + + #[cfg(unix)] + cmd.process_group(0); // Prevent sigint from terminal. + + let child = cmd.spawn()?; self.pid = child.id().map(|pid| pid as i32); let mut process = PostgresProcessAsync { child, - notify: self.notify.clone(), + shutdown: self.shutdown.subscribe(), + shutdown_complete: self.shutdown_complete.clone(), initdb_dir: self.initdb_dir.clone(), port: self.port, version: self.version, @@ -196,11 +212,13 @@ impl PostgresProcess { loop { let mut line = String::new(); select! { - _ = process.notify.notified() => { - if let Err(err) = process.stop().await { - error!("[fdw] shutdown error: {}", err); + _ = process.shutdown.changed() => { + if *process.shutdown.borrow() { + if let Err(err) = process.stop().await { + error!("[fdw] shutdown error: {}", err); + } + break; } - break; } _ = process.child.wait() => { @@ -222,59 +240,66 @@ impl PostgresProcess { } } - process.notify.notify_waiters(); + let _ = process.shutdown_complete.send(true); }); Ok(()) } - /// Access the notify channel. - pub(super) fn notify(&self) -> Arc { - self.notify.clone() + /// Get a receiver for shutdown completion notification. + pub(super) fn shutdown_receiver(&self) -> watch::Receiver { + self.shutdown_complete.subscribe() } - pub(super) fn connection_pools(&self, cluster: &Cluster) -> Result, Error> { - Ok(Self::pools_to_databases(cluster, 0)? + pub(crate) fn connection_pool_configs( + port: u16, + cluster: &Cluster, + ) -> Result, Error> { + Ok(Self::pools_to_fdw_backends(cluster, 0)? .into_iter() - .map(|(database, _)| { + .enumerate() + .map(|(database_number, backend)| { let address = Address { host: "127.0.0.1".into(), - port: self.port, - database_name: database.clone(), + port, + database_name: backend.database_name.clone(), password: "".into(), // We use trust user: cluster.identifier().user.clone(), - database_number: 0, + database_number, }; - Pool::new(&PoolConfig { - address, - config: Config { - inner: pgdog_stats::Config { - max: 10, - ..Default::default() - }, + ( + backend.role, + PoolConfig { + address, + config: backend.config, }, - }) + ) }) .collect()) } - pub(super) fn pools_to_databases( + pub(super) fn pools_to_fdw_backends( cluster: &Cluster, shard: usize, - ) -> Result, Error> { + ) -> Result, Error> { let shard = cluster .shards() .get(shard) .ok_or(Error::ShardsHostsMismatch)?; Ok(shard - .pools() - .iter() + .pools_with_roles() + .into_iter() .enumerate() - .map(|(number, pool)| { - let database = format!("{}_{}", cluster.identifier().database, number); - (database, pool.addr().clone()) + .map(|(number, (role, pool))| { + let database_name = format!("{}_{}", cluster.identifier().database, number); + FdwBackend { + config: pool.config().clone(), + address: pool.addr().clone(), + database_name, + role, + } }) .collect()) } @@ -301,12 +326,12 @@ impl PostgresProcess { let mut admin_connection = self.admin_connection().await?; let mut created = false; - for (database, _) in Self::pools_to_databases(cluster, 0)? { - if !self.databases.contains(&database) { + for backend in Self::pools_to_fdw_backends(cluster, 0)? { + if !self.databases.contains(&backend.database_name) { admin_connection .execute(format!( r#"CREATE DATABASE {}"#, - quote_identifier(&database) + quote_identifier(&backend.database_name) )) .await?; created = true; @@ -355,9 +380,9 @@ impl PostgresProcess { let mut connections = HashMap::new(); // We checked that all shards have the same number of replicas. 
- let databases: Vec<_> = Self::pools_to_databases(cluster, 0)? + let databases: Vec<_> = Self::pools_to_fdw_backends(cluster, 0)? .into_iter() - .map(|(database, _)| database) + .map(|backend| backend.database_name) .collect(); for database in &databases { @@ -374,8 +399,11 @@ impl PostgresProcess { } for (number, _) in cluster.shards().iter().enumerate() { - for (database, address) in Self::pools_to_databases(cluster, number)? { - let identifier = (cluster.identifier().user.clone(), database.clone()); + for backend in Self::pools_to_fdw_backends(cluster, number)? { + let identifier = ( + cluster.identifier().user.clone(), + backend.database_name.clone(), + ); let connection = connections .get_mut(&identifier) .expect("connection is gone"); @@ -386,7 +414,10 @@ impl PostgresProcess { r#"CREATE SERVER IF NOT EXISTS "shard_{}" FOREIGN DATA WRAPPER postgres_fdw OPTIONS (host '{}', port '{}', dbname '{}')"#, - number, address.host, address.port, address.database_name, + number, + backend.address.host, + backend.address.port, + backend.address.database_name, )) .await?; } @@ -400,8 +431,8 @@ impl PostgresProcess { OPTIONS (user '{}', password '{}')"#, quote_identifier(&identifier.0), number, - address.user, - address.password + backend.address.user, + backend.address.password )) .await?; } @@ -467,13 +498,34 @@ impl PostgresProcess { } pub(crate) async fn stop_wait(&mut self) { - self.notify.notify_one(); - self.notify.notified().await; + let mut receiver = self.shutdown_complete.subscribe(); + + // Check if already complete (process may have exited). + if *receiver.borrow() { + self.pid.take(); + return; + } + + // Signal shutdown. + self.shutdown.send_modify(|v| *v = true); + + // Wait for shutdown to complete. + while receiver.changed().await.is_ok() { + if *receiver.borrow() { + break; + } + } self.pid.take(); } pub(crate) fn request_stop(&mut self) { - self.notify.notify_one(); + self.shutdown.send_modify(|v| *v = true); + self.pid.take(); + } + + /// Clear the pid to prevent dirty shutdown warning. + /// Used when the process has already exited. + pub(crate) fn clear_pid(&mut self) { self.pid.take(); } } @@ -481,7 +533,7 @@ impl PostgresProcess { impl Drop for PostgresProcess { fn drop(&mut self) { if let Some(pid) = self.pid.take() { - warn!("[fdw] dirty shutdown initiated"); + warn!("[fdw] dirty shutdown initiated for pid {}", pid); #[cfg(unix)] { @@ -539,7 +591,7 @@ mod test { ] ); - let mut server = process.connection("pgdog", "pgdog_p").await.unwrap(); + let mut server = process.connection("pgdog", "pgdog_0").await.unwrap(); server .execute("SELECT * FROM pgdog.test_postgres_process") .await diff --git a/pgdog/src/backend/pool/connection/mod.rs b/pgdog/src/backend/pool/connection/mod.rs index 7ea42794..ca8369f5 100644 --- a/pgdog/src/backend/pool/connection/mod.rs +++ b/pgdog/src/backend/pool/connection/mod.rs @@ -140,7 +140,9 @@ impl Connection { /// Try to get a connection for the given route. async fn try_conn(&mut self, request: &Request, route: &Route) -> Result<(), Error> { if let Shard::Direct(shard) = route.shard() { - let mut server = if route.is_read() { + let mut server = if route.is_fdw_fallback() { + self.cluster()?.primary_fdw(request).await? + } else if route.is_read() { self.cluster()?.replica(*shard, request).await? } else { self.cluster()?.primary(*shard, request).await? 
diff --git a/pgdog/src/backend/pool/error.rs b/pgdog/src/backend/pool/error.rs index c1fddccb..edab5441 100644 --- a/pgdog/src/backend/pool/error.rs +++ b/pgdog/src/backend/pool/error.rs @@ -50,6 +50,9 @@ pub enum Error { #[error("no databases")] NoDatabases, + #[error("fdw backend not configured")] + NoFdw, + #[error("config values contain null bytes")] NullBytes, diff --git a/pgdog/src/frontend/client/query_engine/connect.rs b/pgdog/src/frontend/client/query_engine/connect.rs index 240fdbbf..2b5e2466 100644 --- a/pgdog/src/frontend/client/query_engine/connect.rs +++ b/pgdog/src/frontend/client/query_engine/connect.rs @@ -1,3 +1,4 @@ +use lazy_static::lazy_static; use tokio::time::timeout; use crate::frontend::router::parser::ShardWithPriority; @@ -29,6 +30,15 @@ impl QueryEngine { } let connect_route = connect_route.unwrap_or(context.client_request.route()); + let connect_route = if context.params.is_postgres_fdw() { + lazy_static! { + static ref FDW_ROUTE: Route = Route::fdw_fallback(); + } + + &FDW_ROUTE + } else { + connect_route + }; let request = Request::new(*context.id, connect_route.is_read()); diff --git a/pgdog/src/frontend/client/query_engine/route_query.rs b/pgdog/src/frontend/client/query_engine/route_query.rs index 38875dcf..855b6c63 100644 --- a/pgdog/src/frontend/client/query_engine/route_query.rs +++ b/pgdog/src/frontend/client/query_engine/route_query.rs @@ -88,6 +88,7 @@ impl QueryEngine { context.transaction, context.sticky, )?; + match self.router.query(router_context) { Ok(command) => { context.client_request.route = Some(command.route().clone()); diff --git a/pgdog/src/frontend/listener.rs b/pgdog/src/frontend/listener.rs index 1fc92946..2274a615 100644 --- a/pgdog/src/frontend/listener.rs +++ b/pgdog/src/frontend/listener.rs @@ -151,11 +151,11 @@ impl Listener { } } - self.shutdown.notify_waiters(); - if let Err(_) = timeout(shutdown_timeout, PostgresLauncher::get().shutdown_wait()).await { error!("[fdw] graceful shutdown failed"); } + + self.shutdown.notify_waiters(); } async fn handle_client(stream: TcpStream, addr: SocketAddr) -> Result<(), Error> { diff --git a/pgdog/src/frontend/router/parser/route.rs b/pgdog/src/frontend/router/parser/route.rs index 4505c053..f1de4b3f 100644 --- a/pgdog/src/frontend/router/parser/route.rs +++ b/pgdog/src/frontend/router/parser/route.rs @@ -90,6 +90,7 @@ pub struct Route { rollback_savepoint: bool, search_path_driven: bool, schema_changed: bool, + fdw_fallback: bool, } impl Display for Route { @@ -140,6 +141,15 @@ impl Route { } } + /// Create new fdw fallback route. + pub fn fdw_fallback() -> Self { + Self { + shard: ShardWithPriority::new_override_fdw_fallback(), + fdw_fallback: true, + ..Default::default() + } + } + /// Returns true if this is a query that /// can be sent to a replica. pub fn is_read(&self) -> bool { @@ -152,6 +162,10 @@ impl Route { !self.is_read() } + pub fn is_fdw_fallback(&self) -> bool { + self.fdw_fallback + } + /// Get shard if any. 
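
Tying the routing pieces together: a client opts into the embedded FDW backend by sending `pgdog.backend=fdw` as a startup parameter, which makes `connect` discard whatever route the parser produced and pin the session to the fdw fallback route (`Shard::Direct(0)` with an `FdwFallback` override, served by the fdw-backed primary). A sketch of the parameter check, with a plain map standing in for the real `Parameters` type:

use std::collections::HashMap;

// Sketch: detect the fdw opt-in the way Parameters::is_postgres_fdw does,
// using a HashMap in place of the real Parameters type.
fn is_postgres_fdw(params: &HashMap<String, String>) -> bool {
    params
        .get("pgdog.backend")
        .map(|v| v == "fdw")
        .unwrap_or(false)
}

fn main() {
    let mut params = HashMap::new();
    assert!(!is_postgres_fdw(&params));

    params.insert("pgdog.backend".into(), "fdw".into());
    assert!(is_postgres_fdw(&params));
}

How the parameter reaches the startup packet (for example via a client's `options` setting) is client-specific and not shown in this patch.
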
pub fn shard(&self) -> &Shard { &self.shard @@ -345,6 +359,7 @@ pub enum OverrideReason { Transaction, OnlyOneShard, RewriteUpdate, + FdwFallback, } #[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)] @@ -426,6 +441,13 @@ impl ShardWithPriority { } } + pub fn new_override_fdw_fallback() -> Self { + Self { + shard: Shard::Direct(0), + source: ShardSource::Override(OverrideReason::FdwFallback), + } + } + pub fn new_default_unset(shard: Shard) -> Self { Self { shard, diff --git a/pgdog/src/net/parameter.rs b/pgdog/src/net/parameter.rs index a9db66c6..001bb6f2 100644 --- a/pgdog/src/net/parameter.rs +++ b/pgdog/src/net/parameter.rs @@ -405,6 +405,12 @@ impl Parameters { pub fn search_path(&self) -> Option<&ParameterValue> { self.get("search_path") } + + pub fn is_postgres_fdw(&self) -> bool { + self.get("pgdog.backend") + .map(|p| p.as_str() == Some("fdw")) + .unwrap_or_default() + } } impl Deref for Parameters { From b9de14a284df9cbc4fb0fe3988435ab21b65869b Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Tue, 3 Feb 2026 17:47:30 -0800 Subject: [PATCH 12/29] save --- pgdog-plugin/src/bindings.rs | 470 +++++++++++++++++++----------- pgdog/src/backend/fdw/lb.rs | 55 +++- pgdog/src/backend/fdw/postgres.rs | 1 + pgdog/src/backend/pool/cluster.rs | 47 ++- 4 files changed, 389 insertions(+), 184 deletions(-) diff --git a/pgdog-plugin/src/bindings.rs b/pgdog-plugin/src/bindings.rs index 561d24e5..6f47703d 100644 --- a/pgdog-plugin/src/bindings.rs +++ b/pgdog-plugin/src/bindings.rs @@ -1,213 +1,338 @@ /* automatically generated by rust-bindgen 0.71.1 */ -pub const _STDINT_H: u32 = 1; -pub const _FEATURES_H: u32 = 1; -pub const _DEFAULT_SOURCE: u32 = 1; -pub const __GLIBC_USE_ISOC2Y: u32 = 0; -pub const __GLIBC_USE_ISOC23: u32 = 0; -pub const __USE_ISOC11: u32 = 1; -pub const __USE_ISOC99: u32 = 1; -pub const __USE_ISOC95: u32 = 1; -pub const __USE_POSIX_IMPLICITLY: u32 = 1; -pub const _POSIX_SOURCE: u32 = 1; -pub const _POSIX_C_SOURCE: u32 = 200809; -pub const __USE_POSIX: u32 = 1; -pub const __USE_POSIX2: u32 = 1; -pub const __USE_POSIX199309: u32 = 1; -pub const __USE_POSIX199506: u32 = 1; -pub const __USE_XOPEN2K: u32 = 1; -pub const __USE_XOPEN2K8: u32 = 1; -pub const _ATFILE_SOURCE: u32 = 1; pub const __WORDSIZE: u32 = 64; -pub const __WORDSIZE_TIME64_COMPAT32: u32 = 1; -pub const __SYSCALL_WORDSIZE: u32 = 64; -pub const __TIMESIZE: u32 = 64; -pub const __USE_TIME_BITS64: u32 = 1; -pub const __USE_MISC: u32 = 1; -pub const __USE_ATFILE: u32 = 1; -pub const __USE_FORTIFY_LEVEL: u32 = 0; -pub const __GLIBC_USE_DEPRECATED_GETS: u32 = 0; -pub const __GLIBC_USE_DEPRECATED_SCANF: u32 = 0; -pub const __GLIBC_USE_C23_STRTOL: u32 = 0; -pub const _STDC_PREDEF_H: u32 = 1; -pub const __STDC_IEC_559__: u32 = 1; -pub const __STDC_IEC_60559_BFP__: u32 = 201404; -pub const __STDC_IEC_559_COMPLEX__: u32 = 1; -pub const __STDC_IEC_60559_COMPLEX__: u32 = 201404; -pub const __STDC_ISO_10646__: u32 = 201706; -pub const __GNU_LIBRARY__: u32 = 6; -pub const __GLIBC__: u32 = 2; -pub const __GLIBC_MINOR__: u32 = 42; -pub const _SYS_CDEFS_H: u32 = 1; -pub const __glibc_c99_flexarr_available: u32 = 1; -pub const __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI: u32 = 0; -pub const __HAVE_GENERIC_SELECTION: u32 = 1; -pub const __GLIBC_USE_LIB_EXT2: u32 = 0; -pub const __GLIBC_USE_IEC_60559_BFP_EXT: u32 = 0; -pub const __GLIBC_USE_IEC_60559_BFP_EXT_C23: u32 = 0; -pub const __GLIBC_USE_IEC_60559_EXT: u32 = 0; -pub const __GLIBC_USE_IEC_60559_FUNCS_EXT: u32 = 0; -pub const __GLIBC_USE_IEC_60559_FUNCS_EXT_C23: u32 = 0; -pub const 
__GLIBC_USE_IEC_60559_TYPES_EXT: u32 = 0; -pub const _BITS_TYPES_H: u32 = 1; -pub const _BITS_TYPESIZES_H: u32 = 1; -pub const __OFF_T_MATCHES_OFF64_T: u32 = 1; -pub const __INO_T_MATCHES_INO64_T: u32 = 1; -pub const __RLIM_T_MATCHES_RLIM64_T: u32 = 1; -pub const __STATFS_MATCHES_STATFS64: u32 = 1; -pub const __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64: u32 = 1; -pub const __FD_SETSIZE: u32 = 1024; -pub const _BITS_TIME64_H: u32 = 1; -pub const _BITS_WCHAR_H: u32 = 1; -pub const _BITS_STDINT_INTN_H: u32 = 1; -pub const _BITS_STDINT_UINTN_H: u32 = 1; -pub const _BITS_STDINT_LEAST_H: u32 = 1; -pub const INT8_MIN: i32 = -128; -pub const INT16_MIN: i32 = -32768; -pub const INT32_MIN: i32 = -2147483648; +pub const __has_safe_buffers: u32 = 1; +pub const __DARWIN_ONLY_64_BIT_INO_T: u32 = 1; +pub const __DARWIN_ONLY_UNIX_CONFORMANCE: u32 = 1; +pub const __DARWIN_ONLY_VERS_1050: u32 = 1; +pub const __DARWIN_UNIX03: u32 = 1; +pub const __DARWIN_64_BIT_INO_T: u32 = 1; +pub const __DARWIN_VERS_1050: u32 = 1; +pub const __DARWIN_NON_CANCELABLE: u32 = 0; +pub const __DARWIN_SUF_EXTSN: &[u8; 14] = b"$DARWIN_EXTSN\0"; +pub const __DARWIN_C_ANSI: u32 = 4096; +pub const __DARWIN_C_FULL: u32 = 900000; +pub const __DARWIN_C_LEVEL: u32 = 900000; +pub const __STDC_WANT_LIB_EXT1__: u32 = 1; +pub const __DARWIN_NO_LONG_LONG: u32 = 0; +pub const _DARWIN_FEATURE_64_BIT_INODE: u32 = 1; +pub const _DARWIN_FEATURE_ONLY_64_BIT_INODE: u32 = 1; +pub const _DARWIN_FEATURE_ONLY_VERS_1050: u32 = 1; +pub const _DARWIN_FEATURE_ONLY_UNIX_CONFORMANCE: u32 = 1; +pub const _DARWIN_FEATURE_UNIX_CONFORMANCE: u32 = 3; +pub const __has_ptrcheck: u32 = 0; +pub const USE_CLANG_TYPES: u32 = 0; +pub const __PTHREAD_SIZE__: u32 = 8176; +pub const __PTHREAD_ATTR_SIZE__: u32 = 56; +pub const __PTHREAD_MUTEXATTR_SIZE__: u32 = 8; +pub const __PTHREAD_MUTEX_SIZE__: u32 = 56; +pub const __PTHREAD_CONDATTR_SIZE__: u32 = 8; +pub const __PTHREAD_COND_SIZE__: u32 = 40; +pub const __PTHREAD_ONCE_SIZE__: u32 = 8; +pub const __PTHREAD_RWLOCK_SIZE__: u32 = 192; +pub const __PTHREAD_RWLOCKATTR_SIZE__: u32 = 16; pub const INT8_MAX: u32 = 127; pub const INT16_MAX: u32 = 32767; pub const INT32_MAX: u32 = 2147483647; +pub const INT64_MAX: u64 = 9223372036854775807; +pub const INT8_MIN: i32 = -128; +pub const INT16_MIN: i32 = -32768; +pub const INT32_MIN: i32 = -2147483648; +pub const INT64_MIN: i64 = -9223372036854775808; pub const UINT8_MAX: u32 = 255; pub const UINT16_MAX: u32 = 65535; pub const UINT32_MAX: u32 = 4294967295; +pub const UINT64_MAX: i32 = -1; pub const INT_LEAST8_MIN: i32 = -128; pub const INT_LEAST16_MIN: i32 = -32768; pub const INT_LEAST32_MIN: i32 = -2147483648; +pub const INT_LEAST64_MIN: i64 = -9223372036854775808; pub const INT_LEAST8_MAX: u32 = 127; pub const INT_LEAST16_MAX: u32 = 32767; pub const INT_LEAST32_MAX: u32 = 2147483647; +pub const INT_LEAST64_MAX: u64 = 9223372036854775807; pub const UINT_LEAST8_MAX: u32 = 255; pub const UINT_LEAST16_MAX: u32 = 65535; pub const UINT_LEAST32_MAX: u32 = 4294967295; +pub const UINT_LEAST64_MAX: i32 = -1; pub const INT_FAST8_MIN: i32 = -128; -pub const INT_FAST16_MIN: i64 = -9223372036854775808; -pub const INT_FAST32_MIN: i64 = -9223372036854775808; +pub const INT_FAST16_MIN: i32 = -32768; +pub const INT_FAST32_MIN: i32 = -2147483648; +pub const INT_FAST64_MIN: i64 = -9223372036854775808; pub const INT_FAST8_MAX: u32 = 127; -pub const INT_FAST16_MAX: u64 = 9223372036854775807; -pub const INT_FAST32_MAX: u64 = 9223372036854775807; +pub const INT_FAST16_MAX: u32 = 32767; +pub const INT_FAST32_MAX: 
u32 = 2147483647; +pub const INT_FAST64_MAX: u64 = 9223372036854775807; pub const UINT_FAST8_MAX: u32 = 255; -pub const UINT_FAST16_MAX: i32 = -1; -pub const UINT_FAST32_MAX: i32 = -1; -pub const INTPTR_MIN: i64 = -9223372036854775808; +pub const UINT_FAST16_MAX: u32 = 65535; +pub const UINT_FAST32_MAX: u32 = 4294967295; +pub const UINT_FAST64_MAX: i32 = -1; pub const INTPTR_MAX: u64 = 9223372036854775807; +pub const INTPTR_MIN: i64 = -9223372036854775808; pub const UINTPTR_MAX: i32 = -1; -pub const PTRDIFF_MIN: i64 = -9223372036854775808; -pub const PTRDIFF_MAX: u64 = 9223372036854775807; +pub const SIZE_MAX: i32 = -1; +pub const RSIZE_MAX: i32 = -1; +pub const WINT_MIN: i32 = -2147483648; +pub const WINT_MAX: u32 = 2147483647; pub const SIG_ATOMIC_MIN: i32 = -2147483648; pub const SIG_ATOMIC_MAX: u32 = 2147483647; -pub const SIZE_MAX: i32 = -1; -pub const WINT_MIN: u32 = 0; -pub const WINT_MAX: u32 = 4294967295; pub type wchar_t = ::std::os::raw::c_int; -#[repr(C)] -#[repr(align(16))] -#[derive(Debug, Copy, Clone)] -pub struct max_align_t { - pub __clang_max_align_nonce1: ::std::os::raw::c_longlong, - pub __bindgen_padding_0: u64, - pub __clang_max_align_nonce2: u128, -} -#[allow(clippy::unnecessary_operation, clippy::identity_op)] -const _: () = { - ["Size of max_align_t"][::std::mem::size_of::() - 32usize]; - ["Alignment of max_align_t"][::std::mem::align_of::() - 16usize]; - ["Offset of field: max_align_t::__clang_max_align_nonce1"] - [::std::mem::offset_of!(max_align_t, __clang_max_align_nonce1) - 0usize]; - ["Offset of field: max_align_t::__clang_max_align_nonce2"] - [::std::mem::offset_of!(max_align_t, __clang_max_align_nonce2) - 16usize]; -}; -pub type __u_char = ::std::os::raw::c_uchar; -pub type __u_short = ::std::os::raw::c_ushort; -pub type __u_int = ::std::os::raw::c_uint; -pub type __u_long = ::std::os::raw::c_ulong; +pub type max_align_t = f64; +pub type int_least8_t = i8; +pub type int_least16_t = i16; +pub type int_least32_t = i32; +pub type int_least64_t = i64; +pub type uint_least8_t = u8; +pub type uint_least16_t = u16; +pub type uint_least32_t = u32; +pub type uint_least64_t = u64; +pub type int_fast8_t = i8; +pub type int_fast16_t = i16; +pub type int_fast32_t = i32; +pub type int_fast64_t = i64; +pub type uint_fast8_t = u8; +pub type uint_fast16_t = u16; +pub type uint_fast32_t = u32; +pub type uint_fast64_t = u64; pub type __int8_t = ::std::os::raw::c_schar; pub type __uint8_t = ::std::os::raw::c_uchar; pub type __int16_t = ::std::os::raw::c_short; pub type __uint16_t = ::std::os::raw::c_ushort; pub type __int32_t = ::std::os::raw::c_int; pub type __uint32_t = ::std::os::raw::c_uint; -pub type __int64_t = ::std::os::raw::c_long; -pub type __uint64_t = ::std::os::raw::c_ulong; -pub type __int_least8_t = __int8_t; -pub type __uint_least8_t = __uint8_t; -pub type __int_least16_t = __int16_t; -pub type __uint_least16_t = __uint16_t; -pub type __int_least32_t = __int32_t; -pub type __uint_least32_t = __uint32_t; -pub type __int_least64_t = __int64_t; -pub type __uint_least64_t = __uint64_t; -pub type __quad_t = ::std::os::raw::c_long; -pub type __u_quad_t = ::std::os::raw::c_ulong; -pub type __intmax_t = ::std::os::raw::c_long; -pub type __uintmax_t = ::std::os::raw::c_ulong; -pub type __dev_t = ::std::os::raw::c_ulong; -pub type __uid_t = ::std::os::raw::c_uint; -pub type __gid_t = ::std::os::raw::c_uint; -pub type __ino_t = ::std::os::raw::c_ulong; -pub type __ino64_t = ::std::os::raw::c_ulong; -pub type __mode_t = ::std::os::raw::c_uint; -pub type __nlink_t = 
::std::os::raw::c_ulong; -pub type __off_t = ::std::os::raw::c_long; -pub type __off64_t = ::std::os::raw::c_long; -pub type __pid_t = ::std::os::raw::c_int; +pub type __int64_t = ::std::os::raw::c_longlong; +pub type __uint64_t = ::std::os::raw::c_ulonglong; +pub type __darwin_intptr_t = ::std::os::raw::c_long; +pub type __darwin_natural_t = ::std::os::raw::c_uint; +pub type __darwin_ct_rune_t = ::std::os::raw::c_int; +#[repr(C)] +#[derive(Copy, Clone)] +pub union __mbstate_t { + pub __mbstate8: [::std::os::raw::c_char; 128usize], + pub _mbstateL: ::std::os::raw::c_longlong, +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of __mbstate_t"][::std::mem::size_of::<__mbstate_t>() - 128usize]; + ["Alignment of __mbstate_t"][::std::mem::align_of::<__mbstate_t>() - 8usize]; + ["Offset of field: __mbstate_t::__mbstate8"] + [::std::mem::offset_of!(__mbstate_t, __mbstate8) - 0usize]; + ["Offset of field: __mbstate_t::_mbstateL"] + [::std::mem::offset_of!(__mbstate_t, _mbstateL) - 0usize]; +}; +pub type __darwin_mbstate_t = __mbstate_t; +pub type __darwin_ptrdiff_t = ::std::os::raw::c_long; +pub type __darwin_size_t = ::std::os::raw::c_ulong; +pub type __darwin_va_list = __builtin_va_list; +pub type __darwin_wchar_t = ::std::os::raw::c_int; +pub type __darwin_rune_t = __darwin_wchar_t; +pub type __darwin_wint_t = ::std::os::raw::c_int; +pub type __darwin_clock_t = ::std::os::raw::c_ulong; +pub type __darwin_socklen_t = __uint32_t; +pub type __darwin_ssize_t = ::std::os::raw::c_long; +pub type __darwin_time_t = ::std::os::raw::c_long; +pub type __darwin_blkcnt_t = __int64_t; +pub type __darwin_blksize_t = __int32_t; +pub type __darwin_dev_t = __int32_t; +pub type __darwin_fsblkcnt_t = ::std::os::raw::c_uint; +pub type __darwin_fsfilcnt_t = ::std::os::raw::c_uint; +pub type __darwin_gid_t = __uint32_t; +pub type __darwin_id_t = __uint32_t; +pub type __darwin_ino64_t = __uint64_t; +pub type __darwin_ino_t = __darwin_ino64_t; +pub type __darwin_mach_port_name_t = __darwin_natural_t; +pub type __darwin_mach_port_t = __darwin_mach_port_name_t; +pub type __darwin_mode_t = __uint16_t; +pub type __darwin_off_t = __int64_t; +pub type __darwin_pid_t = __int32_t; +pub type __darwin_sigset_t = __uint32_t; +pub type __darwin_suseconds_t = __int32_t; +pub type __darwin_uid_t = __uint32_t; +pub type __darwin_useconds_t = __uint32_t; +pub type __darwin_uuid_t = [::std::os::raw::c_uchar; 16usize]; +pub type __darwin_uuid_string_t = [::std::os::raw::c_char; 37usize]; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct __darwin_pthread_handler_rec { + pub __routine: ::std::option::Option, + pub __arg: *mut ::std::os::raw::c_void, + pub __next: *mut __darwin_pthread_handler_rec, +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of __darwin_pthread_handler_rec"] + [::std::mem::size_of::<__darwin_pthread_handler_rec>() - 24usize]; + ["Alignment of __darwin_pthread_handler_rec"] + [::std::mem::align_of::<__darwin_pthread_handler_rec>() - 8usize]; + ["Offset of field: __darwin_pthread_handler_rec::__routine"] + [::std::mem::offset_of!(__darwin_pthread_handler_rec, __routine) - 0usize]; + ["Offset of field: __darwin_pthread_handler_rec::__arg"] + [::std::mem::offset_of!(__darwin_pthread_handler_rec, __arg) - 8usize]; + ["Offset of field: __darwin_pthread_handler_rec::__next"] + [::std::mem::offset_of!(__darwin_pthread_handler_rec, __next) - 16usize]; +}; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct _opaque_pthread_attr_t { + pub 
__sig: ::std::os::raw::c_long, + pub __opaque: [::std::os::raw::c_char; 56usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of _opaque_pthread_attr_t"][::std::mem::size_of::<_opaque_pthread_attr_t>() - 64usize]; + ["Alignment of _opaque_pthread_attr_t"] + [::std::mem::align_of::<_opaque_pthread_attr_t>() - 8usize]; + ["Offset of field: _opaque_pthread_attr_t::__sig"] + [::std::mem::offset_of!(_opaque_pthread_attr_t, __sig) - 0usize]; + ["Offset of field: _opaque_pthread_attr_t::__opaque"] + [::std::mem::offset_of!(_opaque_pthread_attr_t, __opaque) - 8usize]; +}; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct _opaque_pthread_cond_t { + pub __sig: ::std::os::raw::c_long, + pub __opaque: [::std::os::raw::c_char; 40usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of _opaque_pthread_cond_t"][::std::mem::size_of::<_opaque_pthread_cond_t>() - 48usize]; + ["Alignment of _opaque_pthread_cond_t"] + [::std::mem::align_of::<_opaque_pthread_cond_t>() - 8usize]; + ["Offset of field: _opaque_pthread_cond_t::__sig"] + [::std::mem::offset_of!(_opaque_pthread_cond_t, __sig) - 0usize]; + ["Offset of field: _opaque_pthread_cond_t::__opaque"] + [::std::mem::offset_of!(_opaque_pthread_cond_t, __opaque) - 8usize]; +}; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct _opaque_pthread_condattr_t { + pub __sig: ::std::os::raw::c_long, + pub __opaque: [::std::os::raw::c_char; 8usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of _opaque_pthread_condattr_t"] + [::std::mem::size_of::<_opaque_pthread_condattr_t>() - 16usize]; + ["Alignment of _opaque_pthread_condattr_t"] + [::std::mem::align_of::<_opaque_pthread_condattr_t>() - 8usize]; + ["Offset of field: _opaque_pthread_condattr_t::__sig"] + [::std::mem::offset_of!(_opaque_pthread_condattr_t, __sig) - 0usize]; + ["Offset of field: _opaque_pthread_condattr_t::__opaque"] + [::std::mem::offset_of!(_opaque_pthread_condattr_t, __opaque) - 8usize]; +}; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct _opaque_pthread_mutex_t { + pub __sig: ::std::os::raw::c_long, + pub __opaque: [::std::os::raw::c_char; 56usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of _opaque_pthread_mutex_t"][::std::mem::size_of::<_opaque_pthread_mutex_t>() - 64usize]; + ["Alignment of _opaque_pthread_mutex_t"] + [::std::mem::align_of::<_opaque_pthread_mutex_t>() - 8usize]; + ["Offset of field: _opaque_pthread_mutex_t::__sig"] + [::std::mem::offset_of!(_opaque_pthread_mutex_t, __sig) - 0usize]; + ["Offset of field: _opaque_pthread_mutex_t::__opaque"] + [::std::mem::offset_of!(_opaque_pthread_mutex_t, __opaque) - 8usize]; +}; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct _opaque_pthread_mutexattr_t { + pub __sig: ::std::os::raw::c_long, + pub __opaque: [::std::os::raw::c_char; 8usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of _opaque_pthread_mutexattr_t"] + [::std::mem::size_of::<_opaque_pthread_mutexattr_t>() - 16usize]; + ["Alignment of _opaque_pthread_mutexattr_t"] + [::std::mem::align_of::<_opaque_pthread_mutexattr_t>() - 8usize]; + ["Offset of field: _opaque_pthread_mutexattr_t::__sig"] + [::std::mem::offset_of!(_opaque_pthread_mutexattr_t, __sig) - 0usize]; + ["Offset of field: _opaque_pthread_mutexattr_t::__opaque"] + [::std::mem::offset_of!(_opaque_pthread_mutexattr_t, __opaque) - 8usize]; +}; +#[repr(C)] 
+#[derive(Debug, Copy, Clone)] +pub struct _opaque_pthread_once_t { + pub __sig: ::std::os::raw::c_long, + pub __opaque: [::std::os::raw::c_char; 8usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of _opaque_pthread_once_t"][::std::mem::size_of::<_opaque_pthread_once_t>() - 16usize]; + ["Alignment of _opaque_pthread_once_t"] + [::std::mem::align_of::<_opaque_pthread_once_t>() - 8usize]; + ["Offset of field: _opaque_pthread_once_t::__sig"] + [::std::mem::offset_of!(_opaque_pthread_once_t, __sig) - 0usize]; + ["Offset of field: _opaque_pthread_once_t::__opaque"] + [::std::mem::offset_of!(_opaque_pthread_once_t, __opaque) - 8usize]; +}; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct _opaque_pthread_rwlock_t { + pub __sig: ::std::os::raw::c_long, + pub __opaque: [::std::os::raw::c_char; 192usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of _opaque_pthread_rwlock_t"] + [::std::mem::size_of::<_opaque_pthread_rwlock_t>() - 200usize]; + ["Alignment of _opaque_pthread_rwlock_t"] + [::std::mem::align_of::<_opaque_pthread_rwlock_t>() - 8usize]; + ["Offset of field: _opaque_pthread_rwlock_t::__sig"] + [::std::mem::offset_of!(_opaque_pthread_rwlock_t, __sig) - 0usize]; + ["Offset of field: _opaque_pthread_rwlock_t::__opaque"] + [::std::mem::offset_of!(_opaque_pthread_rwlock_t, __opaque) - 8usize]; +}; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct _opaque_pthread_rwlockattr_t { + pub __sig: ::std::os::raw::c_long, + pub __opaque: [::std::os::raw::c_char; 16usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of _opaque_pthread_rwlockattr_t"] + [::std::mem::size_of::<_opaque_pthread_rwlockattr_t>() - 24usize]; + ["Alignment of _opaque_pthread_rwlockattr_t"] + [::std::mem::align_of::<_opaque_pthread_rwlockattr_t>() - 8usize]; + ["Offset of field: _opaque_pthread_rwlockattr_t::__sig"] + [::std::mem::offset_of!(_opaque_pthread_rwlockattr_t, __sig) - 0usize]; + ["Offset of field: _opaque_pthread_rwlockattr_t::__opaque"] + [::std::mem::offset_of!(_opaque_pthread_rwlockattr_t, __opaque) - 8usize]; +}; #[repr(C)] #[derive(Debug, Copy, Clone)] -pub struct __fsid_t { - pub __val: [::std::os::raw::c_int; 2usize], +pub struct _opaque_pthread_t { + pub __sig: ::std::os::raw::c_long, + pub __cleanup_stack: *mut __darwin_pthread_handler_rec, + pub __opaque: [::std::os::raw::c_char; 8176usize], } #[allow(clippy::unnecessary_operation, clippy::identity_op)] const _: () = { - ["Size of __fsid_t"][::std::mem::size_of::<__fsid_t>() - 8usize]; - ["Alignment of __fsid_t"][::std::mem::align_of::<__fsid_t>() - 4usize]; - ["Offset of field: __fsid_t::__val"][::std::mem::offset_of!(__fsid_t, __val) - 0usize]; + ["Size of _opaque_pthread_t"][::std::mem::size_of::<_opaque_pthread_t>() - 8192usize]; + ["Alignment of _opaque_pthread_t"][::std::mem::align_of::<_opaque_pthread_t>() - 8usize]; + ["Offset of field: _opaque_pthread_t::__sig"] + [::std::mem::offset_of!(_opaque_pthread_t, __sig) - 0usize]; + ["Offset of field: _opaque_pthread_t::__cleanup_stack"] + [::std::mem::offset_of!(_opaque_pthread_t, __cleanup_stack) - 8usize]; + ["Offset of field: _opaque_pthread_t::__opaque"] + [::std::mem::offset_of!(_opaque_pthread_t, __opaque) - 16usize]; }; -pub type __clock_t = ::std::os::raw::c_long; -pub type __rlim_t = ::std::os::raw::c_ulong; -pub type __rlim64_t = ::std::os::raw::c_ulong; -pub type __id_t = ::std::os::raw::c_uint; -pub type __time_t = ::std::os::raw::c_long; 
-pub type __useconds_t = ::std::os::raw::c_uint; -pub type __suseconds_t = ::std::os::raw::c_long; -pub type __suseconds64_t = ::std::os::raw::c_long; -pub type __daddr_t = ::std::os::raw::c_int; -pub type __key_t = ::std::os::raw::c_int; -pub type __clockid_t = ::std::os::raw::c_int; -pub type __timer_t = *mut ::std::os::raw::c_void; -pub type __blksize_t = ::std::os::raw::c_long; -pub type __blkcnt_t = ::std::os::raw::c_long; -pub type __blkcnt64_t = ::std::os::raw::c_long; -pub type __fsblkcnt_t = ::std::os::raw::c_ulong; -pub type __fsblkcnt64_t = ::std::os::raw::c_ulong; -pub type __fsfilcnt_t = ::std::os::raw::c_ulong; -pub type __fsfilcnt64_t = ::std::os::raw::c_ulong; -pub type __fsword_t = ::std::os::raw::c_long; -pub type __ssize_t = ::std::os::raw::c_long; -pub type __syscall_slong_t = ::std::os::raw::c_long; -pub type __syscall_ulong_t = ::std::os::raw::c_ulong; -pub type __loff_t = __off64_t; -pub type __caddr_t = *mut ::std::os::raw::c_char; -pub type __intptr_t = ::std::os::raw::c_long; -pub type __socklen_t = ::std::os::raw::c_uint; -pub type __sig_atomic_t = ::std::os::raw::c_int; -pub type int_least8_t = __int_least8_t; -pub type int_least16_t = __int_least16_t; -pub type int_least32_t = __int_least32_t; -pub type int_least64_t = __int_least64_t; -pub type uint_least8_t = __uint_least8_t; -pub type uint_least16_t = __uint_least16_t; -pub type uint_least32_t = __uint_least32_t; -pub type uint_least64_t = __uint_least64_t; -pub type int_fast8_t = ::std::os::raw::c_schar; -pub type int_fast16_t = ::std::os::raw::c_long; -pub type int_fast32_t = ::std::os::raw::c_long; -pub type int_fast64_t = ::std::os::raw::c_long; -pub type uint_fast8_t = ::std::os::raw::c_uchar; -pub type uint_fast16_t = ::std::os::raw::c_ulong; -pub type uint_fast32_t = ::std::os::raw::c_ulong; -pub type uint_fast64_t = ::std::os::raw::c_ulong; -pub type intmax_t = __intmax_t; -pub type uintmax_t = __uintmax_t; +pub type __darwin_pthread_attr_t = _opaque_pthread_attr_t; +pub type __darwin_pthread_cond_t = _opaque_pthread_cond_t; +pub type __darwin_pthread_condattr_t = _opaque_pthread_condattr_t; +pub type __darwin_pthread_key_t = ::std::os::raw::c_ulong; +pub type __darwin_pthread_mutex_t = _opaque_pthread_mutex_t; +pub type __darwin_pthread_mutexattr_t = _opaque_pthread_mutexattr_t; +pub type __darwin_pthread_once_t = _opaque_pthread_once_t; +pub type __darwin_pthread_rwlock_t = _opaque_pthread_rwlock_t; +pub type __darwin_pthread_rwlockattr_t = _opaque_pthread_rwlockattr_t; +pub type __darwin_pthread_t = *mut _opaque_pthread_t; +pub type intmax_t = ::std::os::raw::c_long; +pub type uintmax_t = ::std::os::raw::c_ulong; #[doc = " Wrapper around Rust's [`&str`], without allocating memory, unlike [`std::ffi::CString`].\n The caller must use it as a Rust string. 
This is not a C-string."]
#[repr(C)]
#[derive(Debug, Copy, Clone)]
@@ -324,3 +449,4 @@ const _: () = {
     ["Offset of field: PdRoute::shard"][::std::mem::offset_of!(PdRoute, shard) - 0usize];
     ["Offset of field: PdRoute::read_write"][::std::mem::offset_of!(PdRoute, read_write) - 8usize];
 };
+pub type __builtin_va_list = *mut ::std::os::raw::c_char;
diff --git a/pgdog/src/backend/fdw/lb.rs b/pgdog/src/backend/fdw/lb.rs
index 36c9c240..0ad66f3e 100644
--- a/pgdog/src/backend/fdw/lb.rs
+++ b/pgdog/src/backend/fdw/lb.rs
@@ -1,22 +1,57 @@
+use std::sync::Arc;
+
+use pgdog_config::Role;
+use tokio::spawn;
+
+use crate::backend::fdw::PostgresLauncher;
+use crate::backend::pool::{Guard, Request};
 use crate::backend::{Cluster, LoadBalancer, Pool};
 
 use super::Error;
 use super::PostgresProcess;
 
+#[derive(Clone, Debug)]
 pub(crate) struct FdwLoadBalancer {
-    lb: LoadBalancer,
+    lb: Arc<LoadBalancer>,
 }
 
 impl FdwLoadBalancer {
     pub(crate) fn new(cluster: &Cluster) -> Result<Self, Error> {
-        // We check that all shards have identical configs
-        // before enabling this feature.
-        let pools = PostgresProcess::pools_to_databases(cluster, 0)?;
-        let pools = cluster
-            .shards()
-            .get(0)
-            .ok_or(Error::ShardsHostsMismatch)?
-            .pools_with_roles();
-        todo!()
+        let port = PostgresLauncher::get().port();
+        let configs = PostgresProcess::connection_pool_configs(port, cluster)?;
+        let primary = configs
+            .iter()
+            .find(|p| p.0 == Role::Primary)
+            .map(|p| Pool::new(&p.1));
+        let addrs: Vec<_> = configs.iter().map(|c| c.1.clone()).collect();
+
+        let lb = Arc::new(LoadBalancer::new(
+            &primary,
+            &addrs,
+            cluster.lb_strategy(),
+            cluster.rw_split(),
+        ));
+
+        Ok(Self { lb })
+    }
+
+    pub(crate) fn launch(&self) {
+        let lb = self.lb.clone();
+        spawn(async move {
+            let launcher = PostgresLauncher::get();
+            launcher.wait_ready().await;
+            lb.launch();
+        });
+    }
+
+    pub(crate) fn primary(&self) -> Option<Pool> {
+        self.lb.primary().cloned()
+    }
+
+    pub(crate) async fn get(
+        &self,
+        request: &Request,
+    ) -> Result<Guard, Error> {
+        self.lb.get(request).await
     }
 }
diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs
index 61d53e2a..d64f7c27 100644
--- a/pgdog/src/backend/fdw/postgres.rs
+++ b/pgdog/src/backend/fdw/postgres.rs
@@ -73,6 +73,7 @@ impl PostgresProcessAsync {
     }
 
     /// Force stop immediately.
+    #[allow(dead_code)]
     async fn force_stop(&mut self) -> Result<(), Error> {
         self.child.kill().await?;
         self.child.wait().await?;
diff --git a/pgdog/src/backend/pool/cluster.rs b/pgdog/src/backend/pool/cluster.rs
index b58d0694..1ff178aa 100644
--- a/pgdog/src/backend/pool/cluster.rs
+++ b/pgdog/src/backend/pool/cluster.rs
@@ -17,6 +17,7 @@ use tracing::{error, info};
 use crate::{
     backend::{
         databases::{databases, User as DatabaseUser},
+        fdw::FdwLoadBalancer,
         pool::ee::schema_changed_hook,
         replication::{ReplicationConfig, ShardedSchemas},
         Schema, ShardedTables,
@@ -78,6 +79,9 @@ pub struct Cluster {
     query_parser_engine: QueryParserEngine,
     reload_schema_on_ddl: bool,
     load_schema: LoadSchema,
+    lb_strategy: LoadBalancingStrategy,
+    rw_split: ReadWriteSplit,
+    fdw_lb: Option<FdwLoadBalancer>,
 }
 
 /// Sharding configuration from the cluster.
@@ -246,7 +250,7 @@ impl Cluster {
             database: name.to_owned(),
         });
 
-        Self {
+        let mut cluster = Self {
             identifier: identifier.clone(),
             shards: shards
                 .iter()
@@ -287,7 +291,14 @@ impl Cluster {
             query_parser_engine,
             reload_schema_on_ddl,
             load_schema,
-        }
+            lb_strategy,
+            rw_split,
+            fdw_lb: None,
+        };
+
+        cluster.fdw_lb = FdwLoadBalancer::new(&cluster).ok();
+
+        cluster
     }
 
     /// Change config to work with logical replication streaming.
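A quick usage sketch of the FdwLoadBalancer introduced above (editorial, not part of the patch; the next hunk exposes the same flow through Cluster::primary_fdw/replica_fdw). The use of Request::default() and the assumption that the pool Guard dereferences to a Server with an execute method are illustrative guesses, not confirmed API:

    // Sketch: build the balancer from a cluster whose shards share one config,
    // launch it (the spawned task waits for the embedded Postgres to report
    // ready before opening pools), then check out a connection.
    async fn fdw_roundtrip(cluster: &Cluster) -> Result<(), Error> {
        let lb = FdwLoadBalancer::new(cluster)?;
        lb.launch(); // spawns: wait_ready().await, then launches the inner LoadBalancer

        let mut conn = lb.get(&Request::default()).await?; // assumes pools are up
        conn.execute("SELECT 1").await?; // assumes Guard: DerefMut<Target = Server>
        Ok(())
    }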
@@ -312,6 +323,24 @@ impl Cluster { shard.replica(request).await } + /// Get a connection from the primary fdw conn pool. + pub async fn primary_fdw(&self, request: &Request) -> Result { + if let Some(ref lb) = self.fdw_lb { + Ok(lb.primary().ok_or(Error::NoPrimary)?.get(request).await?) + } else { + Err(Error::NoFdw) + } + } + + /// Get a connection from one of the replica fdw pools. + pub async fn replica_fdw(&self, request: &Request) -> Result { + if let Some(ref lb) = self.fdw_lb { + lb.get(request).await + } else { + Err(Error::NoFdw) + } + } + /// Get a connection to either a primary or a replica. pub async fn primary_or_replica( &self, @@ -358,6 +387,16 @@ impl Cluster { &self.shards } + /// Get the load balancing strategy. + pub fn lb_strategy(&self) -> LoadBalancingStrategy { + self.lb_strategy + } + + /// Get the read/write split strategy. + pub fn rw_split(&self) -> ReadWriteSplit { + self.rw_split + } + /// Get the password the user should use to connect to the database. pub fn password(&self) -> &str { &self.password @@ -548,6 +587,10 @@ impl Cluster { shard.launch(); } + if let Some(ref fdw_lb) = self.fdw_lb { + fdw_lb.launch(); + } + // Only spawn schema loading once per cluster, even if launch() is called multiple times. let already_started = self .readiness From 4a15fd82b0e05d0eee0b33c8be0842b45a652089 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Tue, 3 Feb 2026 20:37:41 -0800 Subject: [PATCH 13/29] recreate --- pgdog/src/backend/databases.rs | 52 ++++++++-------- .../schema/postgres_fdw/custom_types.rs | 14 ----- .../src/backend/schema/postgres_fdw/schema.rs | 60 +++++++++++++++---- 3 files changed, 74 insertions(+), 52 deletions(-) diff --git a/pgdog/src/backend/databases.rs b/pgdog/src/backend/databases.rs index 23ecf679..1a485c94 100644 --- a/pgdog/src/backend/databases.rs +++ b/pgdog/src/backend/databases.rs @@ -626,8 +626,8 @@ mod tests { use super::*; use crate::config::{Config, ConfigAndUsers, Database, Role}; - #[test] - fn test_mirror_user_isolation() { + #[tokio::test] + async fn test_mirror_user_isolation() { // Test that each user gets their own mirror cluster let mut config = Config::default(); @@ -707,8 +707,8 @@ mod tests { assert_eq!(bob_mirrors[0].name(), "db1_mirror"); } - #[test] - fn test_mirror_user_mismatch_handling() { + #[tokio::test] + async fn test_mirror_user_mismatch_handling() { // Test that mirroring is disabled gracefully when users don't match let mut config = Config::default(); @@ -783,8 +783,8 @@ mod tests { ); } - #[test] - fn test_precomputed_mirror_configs() { + #[tokio::test] + async fn test_precomputed_mirror_configs() { // Test that mirror configs are precomputed correctly during initialization let mut config = Config::default(); config.general.mirror_queue = 100; @@ -860,8 +860,8 @@ mod tests { ); } - #[test] - fn test_mirror_config_with_global_defaults() { + #[tokio::test] + async fn test_mirror_config_with_global_defaults() { // Test that global defaults are used when mirror-specific values aren't provided let mut config = Config::default(); config.general.mirror_queue = 150; @@ -933,8 +933,8 @@ mod tests { ); } - #[test] - fn test_mirror_config_partial_overrides() { + #[tokio::test] + async fn test_mirror_config_partial_overrides() { // Test that we can override just queue or just exposure let mut config = Config::default(); config.general.mirror_queue = 100; @@ -1033,8 +1033,8 @@ mod tests { ); } - #[test] - fn test_invalid_mirror_not_precomputed() { + #[tokio::test] + async fn test_invalid_mirror_not_precomputed() { // 
Test that invalid mirror configs (user mismatch) are not precomputed let mut config = Config::default(); @@ -1096,8 +1096,8 @@ mod tests { ); } - #[test] - fn test_mirror_config_no_users() { + #[tokio::test] + async fn test_mirror_config_no_users() { // Test that mirror configs without any users are not precomputed let mut config = Config::default(); config.general.mirror_queue = 100; @@ -1205,8 +1205,8 @@ mod tests { ); } - #[test] - fn test_user_all_databases_creates_pools_for_all_dbs() { + #[tokio::test] + async fn test_user_all_databases_creates_pools_for_all_dbs() { let mut config = Config::default(); config.databases = vec![ @@ -1268,8 +1268,8 @@ mod tests { assert_eq!(databases.all().len(), 3); } - #[test] - fn test_user_multiple_databases_creates_pools_for_specified_dbs() { + #[tokio::test] + async fn test_user_multiple_databases_creates_pools_for_specified_dbs() { let mut config = Config::default(); config.databases = vec![ @@ -1331,8 +1331,8 @@ mod tests { assert_eq!(databases.all().len(), 2); } - #[test] - fn test_all_databases_takes_priority_over_databases_list() { + #[tokio::test] + async fn test_all_databases_takes_priority_over_databases_list() { let mut config = Config::default(); config.databases = vec![ @@ -1413,8 +1413,8 @@ mod tests { ); } - #[test] - fn test_user_with_single_database_creates_one_pool() { + #[tokio::test] + async fn test_user_with_single_database_creates_one_pool() { let mut config = Config::default(); config.databases = vec![ @@ -1463,8 +1463,8 @@ mod tests { assert_eq!(databases.all().len(), 1); } - #[test] - fn test_multiple_users_with_different_database_access() { + #[tokio::test] + async fn test_multiple_users_with_different_database_access() { let mut config = Config::default(); config.databases = vec![ @@ -1541,8 +1541,8 @@ mod tests { assert_eq!(databases.all().len(), 6); } - #[test] - fn test_databases_list_with_nonexistent_database_skipped() { + #[tokio::test] + async fn test_databases_list_with_nonexistent_database_skipped() { let mut config = Config::default(); config.databases = vec![Database { diff --git a/pgdog/src/backend/schema/postgres_fdw/custom_types.rs b/pgdog/src/backend/schema/postgres_fdw/custom_types.rs index 13eab57c..86b1cf3e 100644 --- a/pgdog/src/backend/schema/postgres_fdw/custom_types.rs +++ b/pgdog/src/backend/schema/postgres_fdw/custom_types.rs @@ -1,6 +1,5 @@ //! Custom type definitions (enums, domains, composite types) for foreign tables. 
-use std::collections::HashSet; use std::fmt::Write; use crate::net::messages::DataRow; @@ -180,21 +179,8 @@ impl CustomTypes { &self, server: &mut crate::backend::Server, ) -> Result<(), crate::backend::Error> { - let mut created_schemas = HashSet::new(); - for custom_type in &self.types { - if !created_schemas.contains(&custom_type.schema_name) { - server - .execute(&format!( - "CREATE SCHEMA IF NOT EXISTS {}", - quote_identifier(&custom_type.schema_name) - )) - .await?; - created_schemas.insert(custom_type.schema_name.clone()); - } - let stmt = custom_type.create_statement()?; - tracing::debug!("[fdw::setup] {} [{}]", stmt, server.addr()); server.execute(&stmt).await?; } diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs index 2f92a77f..09537a23 100644 --- a/pgdog/src/backend/schema/postgres_fdw/schema.rs +++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs @@ -76,23 +76,17 @@ impl ForeignTableSchema { // Create extensions first (types may depend on them) self.extensions.setup(server).await?; + server.execute("BEGIN").await?; + + // Drop and recreate managed schemas (CASCADE drops tables and types) + self.drop_schemas(server).await?; + self.create_schemas(server).await?; + // Create custom types (enums, domains, composite types) self.custom_types.setup(server).await?; - let mut schemas = HashSet::new(); let mut tables = HashSet::new(); - for ((schema, table), columns) in &self.tables { - if !schemas.contains(schema) { - server - .execute(&format!( - "CREATE SCHEMA IF NOT EXISTS {}", - super::quote_identifier(&schema) - )) - .await?; - schemas.insert(schema.clone()); - } - let dedup = (schema.clone(), table.clone()); if !tables.contains(&dedup) { let statements = create_foreign_table(columns, sharding_schema)?; @@ -103,6 +97,8 @@ impl ForeignTableSchema { tables.insert(dedup); } } + + server.execute("COMMIT").await?; Ok(()) } @@ -115,6 +111,46 @@ impl ForeignTableSchema { pub fn custom_types(&self) -> &CustomTypes { &self.custom_types } + + /// Collect unique schemas from tables and custom types. + fn schemas(&self) -> HashSet { + self.tables + .keys() + .map(|(s, _)| s.clone()) + .chain( + self.custom_types + .types() + .iter() + .map(|t| t.schema_name.clone()), + ) + .collect() + } + + /// Drop schemas we manage (with CASCADE to drop tables and types). + async fn drop_schemas(&self, server: &mut Server) -> Result<(), super::super::Error> { + for schema in &self.schemas() { + server + .execute(&format!( + "DROP SCHEMA IF EXISTS {} CASCADE", + super::quote_identifier(schema) + )) + .await?; + } + Ok(()) + } + + /// Create schemas we manage. 
+ async fn create_schemas(&self, server: &mut Server) -> Result<(), super::super::Error> { + for schema in &self.schemas() { + server + .execute(&format!( + "CREATE SCHEMA {}", + super::quote_identifier(schema) + )) + .await?; + } + Ok(()) + } } impl ForeignTableColumn { From ea108448ec6b2dc54ecdd990cc225609b5632d5a Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Tue, 3 Feb 2026 22:02:08 -0800 Subject: [PATCH 14/29] save --- integration/postgres_fdw/pgdog.toml | 3 + pgdog-config/src/fdw.rs | 16 +- pgdog/src/backend/databases.rs | 3 + pgdog/src/backend/fdw/error.rs | 3 + pgdog/src/backend/fdw/launcher.rs | 434 +++++++++--------- pgdog/src/backend/fdw/postgres.rs | 153 +++--- pgdog/src/backend/schema/postgres_fdw/mod.rs | 2 +- .../src/backend/schema/postgres_fdw/schema.rs | 67 +++ 8 files changed, 363 insertions(+), 318 deletions(-) diff --git a/integration/postgres_fdw/pgdog.toml b/integration/postgres_fdw/pgdog.toml index d9cd431e..a27c2792 100644 --- a/integration/postgres_fdw/pgdog.toml +++ b/integration/postgres_fdw/pgdog.toml @@ -16,3 +16,6 @@ database_name = "shard_1_fdw" [[sharded_tables]] column = "user_id" database = "pgdog" + +[admin] +password = "pgdog" diff --git a/pgdog-config/src/fdw.rs b/pgdog-config/src/fdw.rs index 2f94c8b7..87422e2a 100644 --- a/pgdog-config/src/fdw.rs +++ b/pgdog-config/src/fdw.rs @@ -6,11 +6,8 @@ pub struct Fdw { #[serde(default)] pub enabled: bool, - #[serde(default = "default_green_port")] - pub green_port: u16, - - #[serde(default = "default_blue_port")] - pub blue_port: u16, + #[serde(default = "default_port")] + pub port: u16, #[serde(default = "default_launch_timeout")] pub launch_timeout: u64, @@ -20,21 +17,16 @@ impl Default for Fdw { fn default() -> Self { Self { enabled: bool::default(), - green_port: default_green_port(), - blue_port: default_blue_port(), + port: default_port(), launch_timeout: default_launch_timeout(), } } } -fn default_green_port() -> u16 { +fn default_port() -> u16 { 6433 } -fn default_blue_port() -> u16 { - 6434 -} - fn default_launch_timeout() -> u64 { 5_000 } diff --git a/pgdog/src/backend/databases.rs b/pgdog/src/backend/databases.rs index 1a485c94..f948c4de 100644 --- a/pgdog/src/backend/databases.rs +++ b/pgdog/src/backend/databases.rs @@ -121,6 +121,9 @@ pub fn reload() -> Result<(), Error> { tls::reload()?; + // Reconfigure FDW with new schema. + PostgresLauncher::get().reconfigure(); + // Remove any unused prepared statements. 
PreparedStatements::global() .write() diff --git a/pgdog/src/backend/fdw/error.rs b/pgdog/src/backend/fdw/error.rs index 0abc5bdc..66d39e82 100644 --- a/pgdog/src/backend/fdw/error.rs +++ b/pgdog/src/backend/fdw/error.rs @@ -30,4 +30,7 @@ pub enum Error { #[error("error parsing postgres version")] PostgresVersion(#[from] ParseFloatError), + + #[error("postgres process exited unexpectedly")] + ProcessExited, } diff --git a/pgdog/src/backend/fdw/launcher.rs b/pgdog/src/backend/fdw/launcher.rs index 823d3637..7fbc4531 100644 --- a/pgdog/src/backend/fdw/launcher.rs +++ b/pgdog/src/backend/fdw/launcher.rs @@ -1,7 +1,7 @@ use std::{ ops::Deref, sync::{ - atomic::{AtomicBool, AtomicU16, Ordering}, + atomic::{AtomicBool, Ordering}, Arc, }, }; @@ -12,13 +12,25 @@ use super::{Error, PostgresProcess}; use once_cell::sync::Lazy; use tokio::{ select, spawn, - sync::watch, + sync::broadcast, time::{sleep, Duration}, }; -use tracing::{error, info, warn}; +use tracing::{error, info}; static LAUNCHER: Lazy = Lazy::new(PostgresLauncher::new); +#[derive(Clone, Debug, PartialEq)] +pub enum LauncherEvent { + // Commands + Start, + Shutdown, + Reconfigure, + + // Status + Ready, + ShutdownComplete, +} + #[derive(Debug, Clone)] pub struct PostgresLauncher { inner: Arc, @@ -34,41 +46,24 @@ impl Deref for PostgresLauncher { #[derive(Debug)] pub struct Inner { - /// Incremented to trigger restart/shutdown check in the spawn loop. - restart_trigger: watch::Sender, - /// Whether the launcher should be running. - online: AtomicBool, - /// True when shutdown has been requested (used to detect early shutdown). - shutdown_requested: AtomicBool, - /// Current blue/green port. - port: AtomicU16, - /// True when postgres is ready to accept connections. - ready: watch::Sender, - /// True when shutdown is complete. - shutdown_complete: watch::Sender, + events: broadcast::Sender, + ready: AtomicBool, } impl PostgresLauncher { fn new() -> Self { - let fdw = config().config.fdw; - let port = AtomicU16::new(fdw.blue_port); - - let (restart_trigger, _) = watch::channel(0u64); - let (ready, _) = watch::channel(false); - let (shutdown_complete, _) = watch::channel(false); + let (events, _) = broadcast::channel(16); let launcher = Self { inner: Arc::new(Inner { - restart_trigger, - online: AtomicBool::new(false), - shutdown_requested: AtomicBool::new(false), - port, - ready, - shutdown_complete, + events, + ready: AtomicBool::new(false), }), }; - launcher.spawn(); + // Subscribe before spawning to avoid race condition. + let receiver = launcher.events.subscribe(); + launcher.spawn(receiver); launcher } @@ -77,201 +72,145 @@ impl PostgresLauncher { LAUNCHER.clone() } - /// Get current blue/green port. + /// Get configured port. pub(crate) fn port(&self) -> u16 { - self.port.load(Ordering::Relaxed) + config().config.fdw.port } - /// Start the launcher. - /// - /// Idempontent. + /// Start the launcher. Idempotent. pub(crate) fn launch(&self) { - if self.online.load(Ordering::Relaxed) { + let _ = self.events.send(LauncherEvent::Start); + } + + /// Request reconfiguration. + pub(crate) fn reconfigure(&self) { + let _ = self.events.send(LauncherEvent::Reconfigure); + } + + /// Shutdown and wait for completion. + pub(crate) async fn shutdown_wait(&self) { + // Subscribe before sending to avoid race condition. + let receiver = self.events.subscribe(); + let _ = self.events.send(LauncherEvent::Shutdown); + Self::wait_for(receiver, LauncherEvent::ShutdownComplete).await; + } + + /// Wait for Postgres to be ready. 
+ pub(crate) async fn wait_ready(&self) { + // Subscribe first to avoid race with Ready event. + let receiver = self.events.subscribe(); + if self.ready.load(Ordering::Acquire) { return; } + Self::wait_for(receiver, LauncherEvent::Ready).await; + } - self.launch_blue_green(false); + async fn wait_for(mut receiver: broadcast::Receiver, target: LauncherEvent) { + loop { + match receiver.recv().await { + Ok(event) if event == target => return, + Ok(_) => continue, + Err(broadcast::error::RecvError::Closed) => return, + Err(broadcast::error::RecvError::Lagged(_)) => continue, + } + } } - fn spawn(&self) { + fn send(&self, event: LauncherEvent) { + match &event { + LauncherEvent::Ready => self.ready.store(true, Ordering::Release), + LauncherEvent::Start | LauncherEvent::Shutdown => { + self.ready.store(false, Ordering::Release) + } + _ => {} + } + let _ = self.events.send(event); + } + + fn spawn(&self, receiver: broadcast::Receiver) { let launcher = self.clone(); spawn(async move { - let mut restart_receiver = launcher.restart_trigger.subscribe(); + let mut receiver = receiver; loop { - let online = launcher.online.load(Ordering::Relaxed); - - if !online { - // Check if shutdown was already requested before we even started. - if launcher.shutdown_requested.load(Ordering::Acquire) { - launcher.ready.send_modify(|v| *v = true); - launcher.mark_shutdown(); + // Wait for Start or Shutdown. + match receiver.recv().await { + Ok(LauncherEvent::Start) => {} + Ok(LauncherEvent::Shutdown) => { + launcher.send(LauncherEvent::Ready); + launcher.send(LauncherEvent::ShutdownComplete); return; } - - // Wait for trigger (launch or shutdown). - let _ = restart_receiver.changed().await; - - // Re-check if this was a shutdown request. - if launcher.shutdown_requested.load(Ordering::Acquire) { - launcher.ready.send_modify(|v| *v = true); - launcher.mark_shutdown(); - return; - } - } - - info!( - "[fdw] launching fdw backend on 0.0.0.0:{}", - launcher.port.load(Ordering::Relaxed), - ); - - let had_error = launcher.run(&mut restart_receiver).await.is_err(); - if had_error { - error!("[fdw] launcher exited with error"); + Ok(_) => continue, + Err(broadcast::error::RecvError::Closed) => return, + Err(broadcast::error::RecvError::Lagged(_)) => continue, } - let online = launcher.online.load(Ordering::Relaxed); - if !online { - break; - } + info!("[fdw] launching fdw backend on 0.0.0.0:{}", launcher.port()); - // Delay retry only on error to prevent tight loops. - if had_error { - sleep(Duration::from_millis(1000)).await; + match launcher.run(&mut receiver).await { + Ok(()) => return, // Clean shutdown + Err(err) => { + error!("[fdw] launcher error: {}", err); + sleep(Duration::from_millis(1000)).await; + } } } - - // Signal shutdown complete when exiting the loop. - launcher.mark_shutdown(); }); } - pub(crate) fn shutdown(&self) { - self.shutdown_requested.store(true, Ordering::Release); - self.online.store(false, Ordering::Relaxed); - self.ready.send_modify(|v| *v = false); - self.shutdown_complete.send_modify(|v| *v = false); - self.restart_trigger.send_modify(|v| *v = v.wrapping_add(1)); - } - - pub(crate) async fn shutdown_wait(&self) { - self.shutdown(); - self.wait_shutdown().await; - } - - /// Trigger blue/green deployment. 
- pub(crate) fn launch_blue_green(&self, failover: bool) { - let fdw = config().config.fdw; - let port = self.port.load(Ordering::Relaxed); - - let port = if failover { - if port == fdw.blue_port { - fdw.green_port - } else { - fdw.blue_port - } - } else { - port - }; - - warn!("[fdw] relaunching fdw backend on 0.0.0.0:{}", port); - - self.port.store(port, Ordering::Relaxed); - // Use send_modify to ensure value is updated even without receivers. - self.ready.send_modify(|v| *v = false); - self.shutdown_complete.send_modify(|v| *v = false); - self.online.store(true, Ordering::Relaxed); - self.restart_trigger.send_modify(|v| *v = v.wrapping_add(1)); - } - - /// Wait for Postgres to be ready. - pub(crate) async fn wait_ready(&self) { - let mut receiver = self.ready.subscribe(); - - // Check current state first. - if *receiver.borrow() { - return; - } - - // Wait for ready to become true. - while receiver.changed().await.is_ok() { - if *receiver.borrow() { - return; - } - } - } - - fn mark_ready(&self) { - self.ready.send_modify(|v| *v = true); - } - - async fn run(&self, restart_receiver: &mut watch::Receiver) -> Result<(), Error> { + async fn run(&self, receiver: &mut broadcast::Receiver) -> Result<(), Error> { let port = self.port(); let mut process = PostgresProcess::new(None, port).await?; let mut shutdown_receiver = process.shutdown_receiver(); - // Use a closure to ensure process is stopped on any exit path. - let result = async { - process.launch().await?; - process.wait_ready().await; + process.launch().await?; + process.wait_ready().await; - for cluster in databases().all().values() { - if cluster.shards().len() > 1 { - process.configure(cluster).await?; - } + for cluster in databases().all().values() { + if cluster.shards().len() > 1 { + process.configure(cluster).await?; } + } - self.mark_ready(); + self.send(LauncherEvent::Ready); + loop { select! { - _ = restart_receiver.changed() => { - let online = self.online.load(Ordering::Relaxed); - if online { - process.request_stop(); - } else { - process.stop_wait().await; + event = receiver.recv() => { + match event { + Ok(LauncherEvent::Shutdown) => { + process.stop_wait().await; + self.send(LauncherEvent::ShutdownComplete); + return Ok(()); + } + + Ok(LauncherEvent::Reconfigure) => { + for cluster in databases().all().values() { + if cluster.shards().len() > 1 { + if let Err(err) = process.configure(cluster).await { + error!("[fdw] reconfigure error: {}", err); + } + } + } + } + + Ok(_) => continue, + Err(broadcast::error::RecvError::Closed) => { + process.stop_wait().await; + return Ok(()); + } + Err(broadcast::error::RecvError::Lagged(_)) => continue, } } _ = shutdown_receiver.changed() => { - // Process exited (possibly unexpectedly). - // Clear pid to prevent dirty shutdown warning since process already exited. process.clear_pid(); + self.send(LauncherEvent::ShutdownComplete); + return Err(Error::ProcessExited); } } - - Ok::<(), Error>(()) - } - .await; - - // Ensure process is stopped if we're exiting due to an error. - if result.is_err() { - process.stop_wait().await; - } - - self.mark_shutdown(); - - result - } - - fn mark_shutdown(&self) { - self.shutdown_complete.send_modify(|v| *v = true); - } - - async fn wait_shutdown(&self) { - let mut receiver = self.shutdown_complete.subscribe(); - - // Check current state first. - if *receiver.borrow() { - return; - } - - // Wait for shutdown_complete to become true. 
- while receiver.changed().await.is_ok() { - if *receiver.borrow() { - return; - } } } } @@ -281,43 +220,41 @@ mod test { use super::*; use crate::backend::{pool::Address, ConnectReason, Server, ServerOptions}; use crate::config::config; + use tokio::time::timeout; - #[tokio::test] - async fn test_postgres_blue_green() { - use tokio::time::timeout; + fn test_launcher() -> PostgresLauncher { + let (events, _) = broadcast::channel(16); + let launcher = PostgresLauncher { + inner: Arc::new(Inner { + events, + ready: AtomicBool::new(false), + }), + }; + let receiver = launcher.events.subscribe(); + launcher.spawn(receiver); + launcher + } + #[tokio::test] + async fn test_postgres_launcher() { crate::logger(); let fdw = config().config.fdw; - let mut address = Address { + let address = Address { host: "127.0.0.1".into(), - port: fdw.blue_port, + port: fdw.port, user: "postgres".into(), database_name: "postgres".into(), ..Default::default() }; let launcher = PostgresLauncher::get(); - launcher.launch_blue_green(false); - - timeout(Duration::from_secs(10), launcher.wait_ready()) - .await - .expect("timeout waiting for first ready"); - - let mut conn = - Server::connect(&address, ServerOptions::default(), ConnectReason::default()) - .await - .unwrap(); - conn.execute("SELECT 1").await.unwrap(); - drop(conn); - - launcher.launch_blue_green(true); + launcher.launch(); timeout(Duration::from_secs(10), launcher.wait_ready()) .await - .expect("timeout waiting for second ready"); + .expect("timeout waiting for ready"); - address.port = fdw.green_port; let mut conn = Server::connect(&address, ServerOptions::default(), ConnectReason::default()) .await @@ -331,34 +268,77 @@ mod test { #[tokio::test] async fn test_shutdown_without_start() { - use tokio::time::timeout; - - // Test that shutdown_wait() works even if FDW was never started. - // This creates a new launcher directly (not the singleton) to avoid - // interference from other tests. - let (restart_trigger, _) = watch::channel(0u64); - let (ready, _) = watch::channel(false); - let (shutdown_complete, _) = watch::channel(false); + let launcher = test_launcher(); - let launcher = PostgresLauncher { - inner: Arc::new(Inner { - restart_trigger, - online: AtomicBool::new(false), - shutdown_requested: AtomicBool::new(false), - port: AtomicU16::new(6433), - ready, - shutdown_complete, - }), - }; - - launcher.spawn(); - - // Give spawn task time to start waiting + // Give spawn task time to start waiting. sleep(Duration::from_millis(10)).await; - // Shutdown without ever starting - should not hang + // Shutdown without ever starting - should not hang. timeout(Duration::from_secs(5), launcher.shutdown_wait()) .await .expect("shutdown_wait() hung when FDW was never started"); } + + #[tokio::test] + async fn test_wait_ready_no_race() { + // Test that wait_ready doesn't miss Ready event due to race condition. + // Run multiple iterations to increase chance of hitting race window. + for _ in 0..100 { + let launcher = test_launcher(); + + // Spawn task that sends Ready immediately. + let launcher_clone = launcher.clone(); + spawn(async move { + launcher_clone.send(LauncherEvent::Ready); + }); + + // wait_ready should not hang even if Ready is sent + // between subscribe and wait. + timeout(Duration::from_millis(100), launcher.wait_ready()) + .await + .expect("wait_ready() missed Ready event - race condition"); + } + } + + #[tokio::test] + async fn test_shutdown_wait_no_race() { + // Test that shutdown_wait doesn't miss ShutdownComplete due to race. 
+ for _ in 0..100 { + let launcher = test_launcher(); + + // Give spawn task time to start. + sleep(Duration::from_millis(1)).await; + + // shutdown_wait sends Shutdown and waits for ShutdownComplete. + // The spawn loop should receive Shutdown and send ShutdownComplete. + // This should not hang even with tight timing. + timeout(Duration::from_millis(100), launcher.shutdown_wait()) + .await + .expect("shutdown_wait() missed ShutdownComplete - race condition"); + } + } + + #[tokio::test] + async fn test_concurrent_wait_ready() { + // Multiple tasks waiting for Ready concurrently. + let launcher = test_launcher(); + + let mut handles = vec![]; + for _ in 0..10 { + let l = launcher.clone(); + handles.push(spawn(async move { + timeout(Duration::from_millis(100), l.wait_ready()) + .await + .expect("concurrent wait_ready timed out"); + })); + } + + // Small delay then send Ready. + sleep(Duration::from_millis(5)).await; + launcher.send(LauncherEvent::Ready); + + for handle in handles { + handle.await.unwrap(); + } + } } diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs index d64f7c27..b87e316d 100644 --- a/pgdog/src/backend/fdw/postgres.rs +++ b/pgdog/src/backend/fdw/postgres.rs @@ -28,7 +28,7 @@ use tracing::{error, info, warn}; use crate::backend::{ pool::{Address, Config, PoolConfig, Request}, - schema::postgres_fdw::{quote_identifier, ForeignTableSchema}, + schema::postgres_fdw::{quote_identifier, FdwServerDef, ForeignTableSchema}, Cluster, ConnectReason, Server, ServerOptions, }; @@ -49,7 +49,7 @@ struct PostgresProcessAsync { impl PostgresProcessAsync { /// Stop Postgres and cleanup. async fn stop(&mut self) -> Result<(), Error> { - warn!( + info!( "[fdw] stopping PostgreSQL {} running on 0.0.0.0:{}", self.version, self.port ); @@ -71,16 +71,6 @@ impl PostgresProcessAsync { Ok(()) } - - /// Force stop immediately. - #[allow(dead_code)] - async fn force_stop(&mut self) -> Result<(), Error> { - self.child.kill().await?; - self.child.wait().await?; - remove_dir_all(&self.initdb_dir).await?; - - Ok(()) - } } #[derive(Debug, Clone)] @@ -99,10 +89,11 @@ pub(crate) struct PostgresProcess { shutdown: watch::Sender, shutdown_complete: watch::Sender, port: u16, - databases: HashSet, - users: HashSet, pid: Option, version: f32, + /// Tracks which cluster databases have been fully configured. + /// Subsequent clusters with the same database only get user mappings. + configured_databases: HashSet, } impl PostgresProcess { @@ -125,10 +116,9 @@ impl PostgresProcess { shutdown, shutdown_complete, port, - databases: HashSet::new(), - users: HashSet::new(), pid: None, version: bins.version, + configured_databases: HashSet::new(), }) } @@ -305,7 +295,7 @@ impl PostgresProcess { .collect()) } - async fn setup_databases(&mut self, cluster: &Cluster) -> Result { + async fn setup_databases(&mut self, cluster: &Cluster) -> Result<(), Error> { let hosts: Vec<_> = cluster .shards() .iter() @@ -325,41 +315,55 @@ impl PostgresProcess { } let mut admin_connection = self.admin_connection().await?; - let mut created = false; for backend in Self::pools_to_fdw_backends(cluster, 0)? 
{ - if !self.databases.contains(&backend.database_name) { + let exists: Vec = admin_connection + .fetch_all(&format!( + "SELECT datname FROM pg_database WHERE datname = '{}'", + backend.database_name.replace('\'', "''") + )) + .await?; + + if exists.is_empty() { admin_connection .execute(format!( - r#"CREATE DATABASE {}"#, + "CREATE DATABASE {}", quote_identifier(&backend.database_name) )) .await?; - created = true; } } let user = cluster.identifier().user.clone(); - if !self.users.contains(&user) { + let user_exists: Vec = admin_connection + .fetch_all(&format!( + "SELECT rolname FROM pg_roles WHERE rolname = '{}'", + user.replace('\'', "''") + )) + .await?; + + if user_exists.is_empty() { admin_connection .execute(format!( "CREATE USER {} SUPERUSER LOGIN", - quote_identifier(&cluster.identifier().user) + quote_identifier(&user) )) .await?; - self.users.insert(user); } - Ok(created) + Ok(()) } /// Create the same load-balancing and sharding setup we have in pgdog.toml - /// for this cluster. + /// for this cluster. This function is idempotent. pub(crate) async fn configure(&mut self, cluster: &Cluster) -> Result<(), Error> { - let new_database = self.setup_databases(cluster).await?; + self.setup_databases(cluster).await?; let now = Instant::now(); + let cluster_db = cluster.identifier().database.clone(); + let first_setup = !self.configured_databases.contains(&cluster_db); + info!( "[fdw] setting up database={} user={}", cluster.identifier().database, @@ -368,13 +372,15 @@ impl PostgresProcess { let sharding_schema = cluster.sharding_schema(); - let schema = { + let schema = if first_setup { // TODO: Double check schemas are identical on all shards. let shard = random_range(0..sharding_schema.shards); let mut server = cluster .primary_or_replica(shard, &Request::default()) .await?; - ForeignTableSchema::load(&mut server).await? + Some(ForeignTableSchema::load(&mut server).await?) + } else { + None }; // Setup persistent connections. @@ -390,63 +396,59 @@ impl PostgresProcess { let identifier = (cluster.identifier().user.clone(), database.clone()); let mut connection = self.connection(&identifier.0, database).await?; - if new_database { + if first_setup { + // Create extension in a dedicated schema that won't be dropped. + // This prevents DROP SCHEMA public CASCADE from removing postgres_fdw and its servers. + connection + .execute("CREATE SCHEMA IF NOT EXISTS pgdog_internal") + .await?; connection - .execute("CREATE EXTENSION IF NOT EXISTS postgres_fdw") + .execute("CREATE EXTENSION IF NOT EXISTS postgres_fdw SCHEMA pgdog_internal") .await?; } connections.insert(identifier, connection); } - for (number, _) in cluster.shards().iter().enumerate() { - for backend in Self::pools_to_fdw_backends(cluster, number)? 
{ - let identifier = ( - cluster.identifier().user.clone(), - backend.database_name.clone(), - ); - let connection = connections - .get_mut(&identifier) - .expect("connection is gone"); - - if new_database { - connection - .execute(format!( - r#"CREATE SERVER IF NOT EXISTS "shard_{}" - FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (host '{}', port '{}', dbname '{}')"#, - number, - backend.address.host, - backend.address.port, - backend.address.database_name, - )) - .await?; + // Build server definitions for each database and run setup + let num_pools = Self::pools_to_fdw_backends(cluster, 0)?.len(); + for pool_position in 0..num_pools { + let database = format!("{}_{}", cluster.identifier().database, pool_position); + let identifier = (cluster.identifier().user.clone(), database); + let mut connection = connections + .get_mut(&identifier) + .expect("connection is gone"); + + // Collect server definitions for all shards using this pool position + let mut server_defs = Vec::new(); + for (shard_num, _) in cluster.shards().iter().enumerate() { + let backends = Self::pools_to_fdw_backends(cluster, shard_num)?; + if let Some(backend) = backends.get(pool_position) { + server_defs.push(FdwServerDef { + shard_num, + host: backend.address.host.clone(), + port: backend.address.port, + database_name: backend.address.database_name.clone(), + user: backend.address.user.clone(), + password: backend.address.password.clone(), + mapping_user: cluster.identifier().user.clone(), + }); } + } - connection - .execute(format!( - r#" - CREATE USER MAPPING IF NOT EXISTS - FOR {} - SERVER "shard_{}" - OPTIONS (user '{}', password '{}')"#, - quote_identifier(&identifier.0), - number, - backend.address.user, - backend.address.password - )) + if first_setup { + schema + .as_ref() + .unwrap() + .setup(&mut connection, &sharding_schema, &server_defs) .await?; + } else { + ForeignTableSchema::setup_user_mappings(&mut connection, &server_defs).await?; } } - if new_database { - for database in &databases { - let identifier = (cluster.identifier().user.clone(), database.clone()); - let mut connection = connections - .get_mut(&identifier) - .expect("connection is gone"); - schema.setup(&mut connection, &sharding_schema).await?; - } + if first_setup { + self.configured_databases.insert(cluster_db); } let elapsed = now.elapsed(); @@ -519,11 +521,6 @@ impl PostgresProcess { self.pid.take(); } - pub(crate) fn request_stop(&mut self) { - self.shutdown.send_modify(|v| *v = true); - self.pid.take(); - } - /// Clear the pid to prevent dirty shutdown warning. /// Used when the process has already exited. 
pub(crate) fn clear_pid(&mut self) { diff --git a/pgdog/src/backend/schema/postgres_fdw/mod.rs b/pgdog/src/backend/schema/postgres_fdw/mod.rs index 2673d0e6..5a2d8d31 100644 --- a/pgdog/src/backend/schema/postgres_fdw/mod.rs +++ b/pgdog/src/backend/schema/postgres_fdw/mod.rs @@ -9,7 +9,7 @@ mod statement; pub use custom_types::{CustomType, CustomTypeKind, CustomTypes, CUSTOM_TYPES_QUERY}; pub use error::Error; pub use extensions::{Extension, Extensions, EXTENSIONS_QUERY}; -pub use schema::{ForeignTableColumn, ForeignTableSchema, FOREIGN_TABLE_SCHEMA}; +pub use schema::{FdwServerDef, ForeignTableColumn, ForeignTableSchema, FOREIGN_TABLE_SCHEMA}; pub use statement::{create_foreign_table, ForeignTableBuilder, PartitionStrategy}; pub(crate) use statement::quote_identifier; diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs index 09537a23..ee836d71 100644 --- a/pgdog/src/backend/schema/postgres_fdw/schema.rs +++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs @@ -10,6 +10,19 @@ use crate::{ use super::custom_types::CustomTypes; use super::extensions::Extensions; +use super::quote_identifier; + +/// Server definition for FDW setup. +#[derive(Debug, Clone)] +pub struct FdwServerDef { + pub shard_num: usize, + pub host: String, + pub port: u16, + pub database_name: String, + pub user: String, + pub password: String, + pub mapping_user: String, +} /// Query to fetch table and column information needed for CREATE FOREIGN TABLE statements. pub static FOREIGN_TABLE_SCHEMA: &str = include_str!("postgres_fdw.sql"); @@ -68,10 +81,12 @@ impl ForeignTableSchema { }) } + /// Full setup: creates servers, schemas, types, and tables. pub(crate) async fn setup( &self, server: &mut Server, sharding_schema: &ShardingSchema, + servers: &[FdwServerDef], ) -> Result<(), super::super::Error> { // Create extensions first (types may depend on them) self.extensions.setup(server).await?; @@ -80,6 +95,36 @@ impl ForeignTableSchema { // Drop and recreate managed schemas (CASCADE drops tables and types) self.drop_schemas(server).await?; + + // Drop and recreate servers (must happen after schema drop, before foreign table creation) + for srv in servers { + server + .execute(format!(r#"DROP SERVER IF EXISTS "shard_{}" CASCADE"#, srv.shard_num)) + .await?; + + server + .execute(format!( + r#"CREATE SERVER "shard_{}" + FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (host '{}', port '{}', dbname '{}')"#, + srv.shard_num, srv.host, srv.port, srv.database_name, + )) + .await?; + + server + .execute(format!( + r#"CREATE USER MAPPING + FOR {} + SERVER "shard_{}" + OPTIONS (user '{}', password '{}')"#, + quote_identifier(&srv.mapping_user), + srv.shard_num, + srv.user, + srv.password, + )) + .await?; + } + self.create_schemas(server).await?; // Create custom types (enums, domains, composite types) @@ -102,6 +147,28 @@ impl ForeignTableSchema { Ok(()) } + /// Add user mappings only (for additional users on an already-configured database). + pub(crate) async fn setup_user_mappings( + server: &mut Server, + servers: &[FdwServerDef], + ) -> Result<(), super::super::Error> { + for srv in servers { + server + .execute(format!( + r#"CREATE USER MAPPING IF NOT EXISTS + FOR {} + SERVER "shard_{}" + OPTIONS (user '{}', password '{}')"#, + quote_identifier(&srv.mapping_user), + srv.shard_num, + srv.user, + srv.password, + )) + .await?; + } + Ok(()) + } + /// Get the extensions. 
pub fn extensions(&self) -> &Extensions { &self.extensions From 36457f083aa6552fbc55ca347a0261815c2b6482 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Tue, 3 Feb 2026 22:04:24 -0800 Subject: [PATCH 15/29] save --- pgdog/src/backend/schema/postgres_fdw/schema.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs index ee836d71..4d4ae294 100644 --- a/pgdog/src/backend/schema/postgres_fdw/schema.rs +++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs @@ -99,7 +99,10 @@ impl ForeignTableSchema { // Drop and recreate servers (must happen after schema drop, before foreign table creation) for srv in servers { server - .execute(format!(r#"DROP SERVER IF EXISTS "shard_{}" CASCADE"#, srv.shard_num)) + .execute(format!( + r#"DROP SERVER IF EXISTS "shard_{}" CASCADE"#, + srv.shard_num + )) .await?; server From d6869fd0d85029dd51d0b318743a5b5fa88b7f5b Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Wed, 4 Feb 2026 07:28:07 -0800 Subject: [PATCH 16/29] reconfigure after each reload --- pgdog/src/backend/fdw/launcher.rs | 4 ++++ pgdog/src/backend/fdw/postgres.rs | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pgdog/src/backend/fdw/launcher.rs b/pgdog/src/backend/fdw/launcher.rs index 7fbc4531..8e538439 100644 --- a/pgdog/src/backend/fdw/launcher.rs +++ b/pgdog/src/backend/fdw/launcher.rs @@ -174,6 +174,8 @@ impl PostgresLauncher { } } + process.configuration_complete(); + self.send(LauncherEvent::Ready); loop { @@ -194,6 +196,8 @@ impl PostgresLauncher { } } } + + process.configuration_complete(); } Ok(_) => continue, diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs index b87e316d..8e15bcd3 100644 --- a/pgdog/src/backend/fdw/postgres.rs +++ b/pgdog/src/backend/fdw/postgres.rs @@ -365,9 +365,10 @@ impl PostgresProcess { let first_setup = !self.configured_databases.contains(&cluster_db); info!( - "[fdw] setting up database={} user={}", + "[fdw] setting up database={} user={} initial={}", cluster.identifier().database, cluster.identifier().user, + first_setup, ); let sharding_schema = cluster.sharding_schema(); @@ -463,6 +464,10 @@ impl PostgresProcess { Ok(()) } + pub(crate) fn configuration_complete(&mut self) { + self.configured_databases.clear(); + } + /// Create server connection. 
pub(crate) async fn admin_connection(&self) -> Result { self.connection("postgres", "postgres").await From fdfb64db10214c4385eea738d38b444cdc5584c0 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Wed, 4 Feb 2026 10:58:15 -0800 Subject: [PATCH 17/29] save --- integration/postgres_fdw/pgdog.toml | 14 +++++++ integration/postgres_fdw/users.toml | 5 +++ pgdog-config/src/fdw.rs | 4 -- pgdog-config/src/general.rs | 14 ++++++- pgdog-config/src/sharding.rs | 38 +++++++++++++++++++ pgdog/src/backend/databases.rs | 2 +- pgdog/src/backend/pool/cluster.rs | 17 ++++++++- .../frontend/client/query_engine/connect.rs | 4 +- 8 files changed, 89 insertions(+), 9 deletions(-) diff --git a/integration/postgres_fdw/pgdog.toml b/integration/postgres_fdw/pgdog.toml index a27c2792..6509c246 100644 --- a/integration/postgres_fdw/pgdog.toml +++ b/integration/postgres_fdw/pgdog.toml @@ -13,6 +13,20 @@ shard = 1 host = "127.0.0.1" database_name = "shard_1_fdw" +[[databases]] +name = "pgdog" +shard = 0 +host = "127.0.0.1" +database_name = "shard_0_fdw" +role = "replica" + +[[databases]] +name = "pgdog" +shard = 1 +host = "127.0.0.1" +database_name = "shard_1_fdw" +role = "replica" + [[sharded_tables]] column = "user_id" database = "pgdog" diff --git a/integration/postgres_fdw/users.toml b/integration/postgres_fdw/users.toml index 539bb183..ebda3e9d 100644 --- a/integration/postgres_fdw/users.toml +++ b/integration/postgres_fdw/users.toml @@ -2,3 +2,8 @@ name = "pgdog" password = "pgdog" database = "pgdog" + +[[users]] +name = "lev" +password = "lev" +database = "pgdog" diff --git a/pgdog-config/src/fdw.rs b/pgdog-config/src/fdw.rs index 87422e2a..27af750e 100644 --- a/pgdog-config/src/fdw.rs +++ b/pgdog-config/src/fdw.rs @@ -3,9 +3,6 @@ use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Copy)] #[serde(deny_unknown_fields)] pub struct Fdw { - #[serde(default)] - pub enabled: bool, - #[serde(default = "default_port")] pub port: u16, @@ -16,7 +13,6 @@ pub struct Fdw { impl Default for Fdw { fn default() -> Self { Self { - enabled: bool::default(), port: default_port(), launch_timeout: default_launch_timeout(), } diff --git a/pgdog-config/src/general.rs b/pgdog-config/src/general.rs index 23f7ebb6..9787f76c 100644 --- a/pgdog-config/src/general.rs +++ b/pgdog-config/src/general.rs @@ -5,7 +5,11 @@ use std::path::PathBuf; use std::time::Duration; use crate::pooling::ConnectionRecovery; -use crate::{CopyFormat, LoadSchema, QueryParserEngine, QueryParserLevel, SystemCatalogsBehavior}; + +use crate::{ + CopyFormat, CrossShardBackend, LoadSchema, QueryParserEngine, QueryParserLevel, + SystemCatalogsBehavior, +}; use super::auth::{AuthType, PassthoughAuth}; use super::database::{LoadBalancingStrategy, ReadWriteSplit, ReadWriteStrategy}; @@ -212,6 +216,9 @@ pub struct General { /// Load database schema. #[serde(default = "General::load_schema")] pub load_schema: LoadSchema, + /// Cross-shard backend. 
+ #[serde(default = "General::cross_shard_backend")] + pub cross_shard_backend: CrossShardBackend, } impl Default for General { @@ -286,6 +293,7 @@ impl Default for General { resharding_copy_format: CopyFormat::default(), reload_schema_on_ddl: Self::reload_schema_on_ddl(), load_schema: Self::load_schema(), + cross_shard_backend: Self::cross_shard_backend(), } } } @@ -414,6 +422,10 @@ impl General { ) } + fn cross_shard_backend() -> CrossShardBackend { + Self::env_enum_or_default("PGDOG_CROSS_SHARD_BACKEND") + } + pub fn query_timeout(&self) -> Duration { Duration::from_millis(self.query_timeout) } diff --git a/pgdog-config/src/sharding.rs b/pgdog-config/src/sharding.rs index 60eae4ac..927536a5 100644 --- a/pgdog-config/src/sharding.rs +++ b/pgdog-config/src/sharding.rs @@ -400,6 +400,44 @@ impl FromStr for LoadSchema { } } +#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, Hash, Default)] +#[serde(rename_all = "snake_case", deny_unknown_fields)] +pub enum CrossShardBackend { + #[default] + PgDog, + Fdw, + Hybrid, +} + +impl CrossShardBackend { + pub fn need_fdw(&self) -> bool { + matches!(self, Self::Fdw | Self::Hybrid) + } +} + +impl Display for CrossShardBackend { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::PgDog => write!(f, "pgdog"), + Self::Fdw => write!(f, "fdw"), + Self::Hybrid => write!(f, "hybrid"), + } + } +} + +impl FromStr for CrossShardBackend { + type Err = (); + + fn from_str(s: &str) -> Result { + match s { + "pgdog" => Ok(Self::PgDog), + "fdw" => Ok(Self::Fdw), + "hybrid" => Ok(Self::Hybrid), + _ => Err(()), + } + } +} + #[cfg(test)] mod test { use super::*; diff --git a/pgdog/src/backend/databases.rs b/pgdog/src/backend/databases.rs index f948c4de..b9bae901 100644 --- a/pgdog/src/backend/databases.rs +++ b/pgdog/src/backend/databases.rs @@ -91,7 +91,7 @@ pub fn init() -> Result<(), Error> { let config = config(); // Start postgres_fdw compatibility engine. - if config.config.fdw.enabled { + if config.config.general.cross_shard_backend.need_fdw() { PostgresLauncher::get().launch(); } diff --git a/pgdog/src/backend/pool/cluster.rs b/pgdog/src/backend/pool/cluster.rs index 1ff178aa..55a7d9ee 100644 --- a/pgdog/src/backend/pool/cluster.rs +++ b/pgdog/src/backend/pool/cluster.rs @@ -2,7 +2,9 @@ use parking_lot::Mutex; use pgdog_config::{ - LoadSchema, PreparedStatements, QueryParserEngine, QueryParserLevel, Rewrite, RewriteMode, + CrossShardBackend, LoadSchema, PreparedStatements, PreparedStatements, QueryParserEngine, + QueryParserEngine, QueryParserLevel, QueryParserLevel, Rewrite, Rewrite, RewriteMode, + RewriteMode, }; use std::{ sync::{ @@ -82,6 +84,7 @@ pub struct Cluster { lb_strategy: LoadBalancingStrategy, rw_split: ReadWriteSplit, fdw_lb: Option, + cross_shard_backend: CrossShardBackend, } /// Sharding configuration from the cluster. 
@@ -157,6 +160,7 @@ pub struct ClusterConfig<'a> { pub lsn_check_interval: Duration, pub reload_schema_on_ddl: bool, pub load_schema: LoadSchema, + pub cross_shard_backend: CrossShardBackend, } impl<'a> ClusterConfig<'a> { @@ -207,6 +211,7 @@ impl<'a> ClusterConfig<'a> { lsn_check_interval: Duration::from_millis(general.lsn_check_interval), reload_schema_on_ddl: general.reload_schema_on_ddl, load_schema: general.load_schema, + cross_shard_backend: general.cross_shard_backend, } } } @@ -243,6 +248,7 @@ impl Cluster { query_parser_engine, reload_schema_on_ddl, load_schema, + cross_shard_backend, } = config; let identifier = Arc::new(DatabaseUser { @@ -294,9 +300,12 @@ impl Cluster { lb_strategy, rw_split, fdw_lb: None, + cross_shard_backend, }; - cluster.fdw_lb = FdwLoadBalancer::new(&cluster).ok(); + if cross_shard_backend.need_fdw() { + cluster.fdw_lb = FdwLoadBalancer::new(&cluster).ok(); + } cluster } @@ -482,6 +491,10 @@ impl Cluster { true } + pub fn cross_shard_backend(&self) -> CrossShardBackend { + self.cross_shard_backend + } + /// This database/user pair is responsible for schema management. pub fn schema_admin(&self) -> bool { self.schema_admin diff --git a/pgdog/src/frontend/client/query_engine/connect.rs b/pgdog/src/frontend/client/query_engine/connect.rs index 2b5e2466..cff45aa6 100644 --- a/pgdog/src/frontend/client/query_engine/connect.rs +++ b/pgdog/src/frontend/client/query_engine/connect.rs @@ -30,7 +30,9 @@ impl QueryEngine { } let connect_route = connect_route.unwrap_or(context.client_request.route()); - let connect_route = if context.params.is_postgres_fdw() { + let connect_route = if (context.params.is_postgres_fdw() || connect_route.is_cross_shard()) + && self.backend.cluster()?.cross_shard_backend().need_fdw() + { lazy_static! 
{ static ref FDW_ROUTE: Route = Route::fdw_fallback(); } From 140405b42401d78ba3c555043d9b6dcec3bb92d5 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Wed, 4 Feb 2026 15:42:57 -0800 Subject: [PATCH 18/29] save --- integration/pgdog.toml | 3 +- integration/postgres_fdw/pgdog.toml | 5 +- pgdog/src/admin/set.rs | 4 + pgdog/src/backend/databases.rs | 25 ++++- pgdog/src/backend/fdw/launcher.rs | 4 + pgdog/src/backend/fdw/postgres.rs | 67 +++++++++++- pgdog/src/backend/pool/cluster.rs | 7 +- pgdog/src/backend/schema/postgres_fdw/mod.rs | 5 +- .../schema/postgres_fdw/postgres_fdw.sql | 1 + .../src/backend/schema/postgres_fdw/schema.rs | 33 +++++- .../backend/schema/postgres_fdw/statement.rs | 103 +++++++++++++++--- .../frontend/client/query_engine/connect.rs | 17 ++- pgdog/src/frontend/router/parser/query/ddl.rs | 4 +- pgdog/src/frontend/router/parser/route.rs | 14 +++ 14 files changed, 260 insertions(+), 32 deletions(-) diff --git a/integration/pgdog.toml b/integration/pgdog.toml index 041d6bb2..f090f005 100644 --- a/integration/pgdog.toml +++ b/integration/pgdog.toml @@ -22,7 +22,8 @@ tls_certificate = "integration/tls/cert.pem" tls_private_key = "integration/tls/key.pem" query_parser_engine = "pg_query_raw" system_catalogs = "omnisharded_sticky" -reload_schema_on_ddl = false +reload_schema_on_ddl = true +cross_shard_backend = "fdw" [memory] net_buffer = 8096 diff --git a/integration/postgres_fdw/pgdog.toml b/integration/postgres_fdw/pgdog.toml index 6509c246..4f1fb5ad 100644 --- a/integration/postgres_fdw/pgdog.toml +++ b/integration/postgres_fdw/pgdog.toml @@ -1,5 +1,6 @@ -[fdw] -enabled = true + +[general] +cross_shard_backend = "fdw" [[databases]] name = "pgdog" diff --git a/pgdog/src/admin/set.rs b/pgdog/src/admin/set.rs index af180ec2..fd2fbd7b 100644 --- a/pgdog/src/admin/set.rs +++ b/pgdog/src/admin/set.rs @@ -180,6 +180,10 @@ impl Command for Set { config.config.general.connect_timeout = self.value.parse()?; } + "cross_shard_backend" => { + config.config.general.cross_shard_backend = Self::from_json(&self.value)?; + } + _ => return Err(Error::Syntax), } diff --git a/pgdog/src/backend/databases.rs b/pgdog/src/backend/databases.rs index b9bae901..d20d6296 100644 --- a/pgdog/src/backend/databases.rs +++ b/pgdog/src/backend/databases.rs @@ -83,6 +83,12 @@ pub fn reload_from_existing() -> Result<(), Error> { let databases = from_config(&config); replace_databases(databases, true)?; + + // Reconfigure FDW with new schema. + if config.config.general.cross_shard_backend.need_fdw() { + PostgresLauncher::get().reconfigure(); + } + Ok(()) } @@ -122,7 +128,24 @@ pub fn reload() -> Result<(), Error> { tls::reload()?; // Reconfigure FDW with new schema. - PostgresLauncher::get().reconfigure(); + match ( + old_config.config.general.cross_shard_backend.need_fdw(), + new_config.config.general.cross_shard_backend.need_fdw(), + ) { + (true, true) => { + PostgresLauncher::get().reconfigure(); + } + + (false, true) => { + PostgresLauncher::get().launch(); + } + + (true, false) => { + PostgresLauncher::get().shutdown(); + } + + (false, false) => {} + } // Remove any unused prepared statements. 
PreparedStatements::global() diff --git a/pgdog/src/backend/fdw/launcher.rs b/pgdog/src/backend/fdw/launcher.rs index 8e538439..1332af9c 100644 --- a/pgdog/src/backend/fdw/launcher.rs +++ b/pgdog/src/backend/fdw/launcher.rs @@ -82,6 +82,10 @@ impl PostgresLauncher { let _ = self.events.send(LauncherEvent::Start); } + pub(crate) fn shutdown(&self) { + let _ = self.events.send(LauncherEvent::Shutdown); + } + /// Request reconfiguration. pub(crate) fn reconfigure(&self) { let _ = self.events.send(LauncherEvent::Reconfigure); diff --git a/pgdog/src/backend/fdw/postgres.rs b/pgdog/src/backend/fdw/postgres.rs index 8e15bcd3..c5834822 100644 --- a/pgdog/src/backend/fdw/postgres.rs +++ b/pgdog/src/backend/fdw/postgres.rs @@ -122,8 +122,41 @@ impl PostgresProcess { }) } + /// Kill any existing process listening on the given port. + /// This handles orphaned postgres processes from previous crashes. + #[cfg(unix)] + async fn kill_existing_on_port(port: u16) { + // Use fuser to find and kill any process on the port + let result = Command::new("fuser") + .arg("-k") + .arg(format!("{}/tcp", port)) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .await; + + if let Ok(status) = result { + if status.success() { + warn!( + "[fdw] killed orphaned process on port {} from previous run", + port + ); + // Give it a moment to fully terminate + sleep(Duration::from_millis(100)).await; + } + } + } + + #[cfg(not(unix))] + async fn kill_existing_on_port(_port: u16) { + // Not implemented for non-unix platforms + } + /// Setup and launch Postgres process. pub(crate) async fn launch(&mut self) -> Result<(), Error> { + // Clean up any orphaned postgres from previous crashes + Self::kill_existing_on_port(self.port).await; + info!( "[fdw] initializing \"{}\" (PostgreSQL {})", self.initdb_dir.display(), @@ -168,6 +201,22 @@ impl PostgresProcess { #[cfg(unix)] cmd.process_group(0); // Prevent sigint from terminal. + // SAFETY: prctl(PR_SET_PDEATHSIG) is async-signal-safe and doesn't + // access any shared state. It tells the kernel to send SIGKILL to + // this process when its parent dies, preventing orphaned processes. 
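+        // Note: this covers the reverse direction of kill_existing_on_port()
+        // above: that call cleans up orphans left by a previous pgdog crash,
+        // while PR_SET_PDEATHSIG prevents new orphans if pgdog dies while
+        // the child postgres is still running.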
+ #[cfg(target_os = "linux")] + { + #[allow(unused_imports)] + use std::os::unix::process::CommandExt; + unsafe { + cmd.pre_exec(|| { + const PR_SET_PDEATHSIG: nix::libc::c_int = 1; + nix::libc::prctl(PR_SET_PDEATHSIG, nix::libc::SIGKILL); + Ok(()) + }); + } + } + let child = cmd.spawn()?; self.pid = child.id().map(|pid| pid as i32); @@ -212,8 +261,22 @@ impl PostgresProcess { } } - _ = process.child.wait() => { - error!("[fdw] postgres shut down unexpectedly"); + exit_status = process.child.wait() => { + // Drain remaining stderr before reporting shutdown + loop { + let mut remaining = String::new(); + match reader.read_line(&mut remaining).await { + Ok(0) => break, // EOF + Ok(_) => { + if !remaining.is_empty() { + let remaining = LOG_PREFIX.replace(&remaining, ""); + info!("[fdw::subprocess] {}", remaining.trim()); + } + } + Err(_) => break, + } + } + error!("[fdw] postgres shut down unexpectedly, exit status: {:?}", exit_status); break; } diff --git a/pgdog/src/backend/pool/cluster.rs b/pgdog/src/backend/pool/cluster.rs index 55a7d9ee..cab6932f 100644 --- a/pgdog/src/backend/pool/cluster.rs +++ b/pgdog/src/backend/pool/cluster.rs @@ -2,9 +2,8 @@ use parking_lot::Mutex; use pgdog_config::{ - CrossShardBackend, LoadSchema, PreparedStatements, PreparedStatements, QueryParserEngine, - QueryParserEngine, QueryParserLevel, QueryParserLevel, Rewrite, Rewrite, RewriteMode, - RewriteMode, + CrossShardBackend, LoadSchema, PreparedStatements, QueryParserEngine, QueryParserLevel, + Rewrite, RewriteMode, }; use std::{ sync::{ @@ -303,7 +302,7 @@ impl Cluster { cross_shard_backend, }; - if cross_shard_backend.need_fdw() { + if cross_shard_backend.need_fdw() && cluster.shards().len() > 1 { cluster.fdw_lb = FdwLoadBalancer::new(&cluster).ok(); } diff --git a/pgdog/src/backend/schema/postgres_fdw/mod.rs b/pgdog/src/backend/schema/postgres_fdw/mod.rs index 5a2d8d31..d5ccb3d9 100644 --- a/pgdog/src/backend/schema/postgres_fdw/mod.rs +++ b/pgdog/src/backend/schema/postgres_fdw/mod.rs @@ -10,6 +10,9 @@ pub use custom_types::{CustomType, CustomTypeKind, CustomTypes, CUSTOM_TYPES_QUE pub use error::Error; pub use extensions::{Extension, Extensions, EXTENSIONS_QUERY}; pub use schema::{FdwServerDef, ForeignTableColumn, ForeignTableSchema, FOREIGN_TABLE_SCHEMA}; -pub use statement::{create_foreign_table, ForeignTableBuilder, PartitionStrategy}; +pub use statement::{ + create_foreign_table, CreateForeignTableResult, ForeignTableBuilder, PartitionStrategy, + TypeMismatch, +}; pub(crate) use statement::quote_identifier; diff --git a/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql b/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql index a4555468..16ef63e5 100644 --- a/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql +++ b/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql @@ -27,6 +27,7 @@ WHERE AND n.nspname <> 'pg_catalog' AND n.nspname !~ '^pg_toast' AND n.nspname <> 'information_schema' + AND NOT (n.nspname = 'pgdog' AND c.relname IN ('validator_bigint', 'validator_uuid', 'config')) AND NOT c.relispartition ORDER BY n.nspname, diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs index 4d4ae294..6a117b1b 100644 --- a/pgdog/src/backend/schema/postgres_fdw/schema.rs +++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs @@ -1,7 +1,7 @@ //! Foreign table schema query and data structures. 
use std::collections::{HashMap, HashSet}; -use tracing::debug; +use tracing::{debug, warn}; use crate::{ backend::{schema::postgres_fdw::create_foreign_table, Server, ShardingSchema}, @@ -11,6 +11,7 @@ use crate::{ use super::custom_types::CustomTypes; use super::extensions::Extensions; use super::quote_identifier; +use super::TypeMismatch; /// Server definition for FDW setup. #[derive(Debug, Clone)] @@ -134,18 +135,37 @@ impl ForeignTableSchema { self.custom_types.setup(server).await?; let mut tables = HashSet::new(); + let mut all_type_mismatches: Vec = Vec::new(); + for ((schema, table), columns) in &self.tables { + // Skip internal PgDog tables + if Self::is_internal_table(schema, table) { + continue; + } + let dedup = (schema.clone(), table.clone()); if !tables.contains(&dedup) { - let statements = create_foreign_table(columns, sharding_schema)?; - for sql in statements { + let result = create_foreign_table(columns, sharding_schema)?; + for sql in &result.statements { debug!("[fdw::setup] {} [{}]", sql, server.addr()); - server.execute(&sql).await?; + server.execute(sql).await?; } + all_type_mismatches.extend(result.type_mismatches); tables.insert(dedup); } } + // Log summary of type mismatches if any were found + if !all_type_mismatches.is_empty() { + warn!( + "[fdw] {} table(s) skipped due to sharding config type mismatches:", + all_type_mismatches.len() + ); + for mismatch in &all_type_mismatches { + warn!("[fdw] - {}", mismatch); + } + } + server.execute("COMMIT").await?; Ok(()) } @@ -182,6 +202,11 @@ impl ForeignTableSchema { &self.custom_types } + /// Check if a table is an internal PgDog table that shouldn't be exposed via FDW. + fn is_internal_table(schema: &str, table: &str) -> bool { + schema == "pgdog" && matches!(table, "validator_bigint" | "validator_uuid" | "config") + } + /// Collect unique schemas from tables and custom types. fn schemas(&self) -> HashSet { self.tables diff --git a/pgdog/src/backend/schema/postgres_fdw/statement.rs b/pgdog/src/backend/schema/postgres_fdw/statement.rs index 77117c6f..752db0ad 100644 --- a/pgdog/src/backend/schema/postgres_fdw/statement.rs +++ b/pgdog/src/backend/schema/postgres_fdw/statement.rs @@ -5,12 +5,42 @@ use std::fmt::Write; use rand::Rng; use crate::backend::pool::ShardingSchema; -use crate::config::{FlexibleType, ShardedTable}; +use crate::config::{DataType, FlexibleType, ShardedTable}; use crate::frontend::router::parser::Column; use crate::frontend::router::sharding::Mapping; use super::{Error, ForeignTableColumn}; +/// A type mismatch between a table column and the configured sharding data type. +#[derive(Debug, Clone)] +pub struct TypeMismatch { + pub schema_name: String, + pub table_name: String, + pub column_name: String, + pub column_type: String, + pub configured_type: DataType, +} + +impl std::fmt::Display for TypeMismatch { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}.{}.{}: column type '{}' does not match configured data_type '{:?}'", + self.schema_name, + self.table_name, + self.column_name, + self.column_type, + self.configured_type + ) + } +} + +/// Result of creating foreign table statements. +pub struct CreateForeignTableResult { + pub statements: Vec, + pub type_mismatches: Vec, +} + /// Format a FlexibleType as a SQL literal. fn flexible_type_to_sql(value: &FlexibleType) -> String { match value { @@ -20,6 +50,31 @@ fn flexible_type_to_sql(value: &FlexibleType) -> String { } } +/// Check if a PostgreSQL column type string matches the configured DataType. 
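+///
+/// Matching is a prefix check against the lowercased type name, e.g.:
+///
+///   column_type_matches_data_type("bigint", DataType::Bigint)                   // true
+///   column_type_matches_data_type("character varying(255)", DataType::Varchar)  // true
+///   column_type_matches_data_type("uuid", DataType::Bigint)                     // false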
+fn column_type_matches_data_type(column_type: &str, data_type: DataType) -> bool { + let col_lower = column_type.to_lowercase(); + match data_type { + DataType::Bigint => { + col_lower.starts_with("bigint") + || col_lower.starts_with("int8") + || col_lower.starts_with("bigserial") + || col_lower.starts_with("serial8") + || col_lower.starts_with("integer") + || col_lower.starts_with("int4") + || col_lower.starts_with("int") + || col_lower.starts_with("smallint") + || col_lower.starts_with("int2") + } + DataType::Uuid => col_lower.starts_with("uuid"), + DataType::Varchar => { + col_lower.starts_with("character varying") + || col_lower.starts_with("varchar") + || col_lower.starts_with("text") + } + DataType::Vector => col_lower.starts_with("vector"), + } +} + /// Partition strategy for a sharded table. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum PartitionStrategy { @@ -78,6 +133,7 @@ fn qualified_table(schema: &str, table: &str) -> String { pub struct ForeignTableBuilder<'a> { columns: &'a [ForeignTableColumn], sharding_schema: &'a ShardingSchema, + type_mismatches: Vec, } impl<'a> ForeignTableBuilder<'a> { @@ -86,11 +142,13 @@ impl<'a> ForeignTableBuilder<'a> { Self { columns, sharding_schema, + type_mismatches: Vec::new(), } } /// Find the sharding configuration for this table, if any. - fn find_sharded_config(&self) -> Option<&ShardedTable> { + /// Records any type mismatches encountered and returns a cloned config. + fn find_sharded_config(&mut self) -> Option { let first = self.columns.first()?; let table_name = &first.table_name; let schema_name = &first.schema_name; @@ -103,7 +161,18 @@ impl<'a> ForeignTableBuilder<'a> { }; if let Some(sharded) = self.sharding_schema.tables().get_table(column) { - return Some(sharded); + if !column_type_matches_data_type(&col.column_type, sharded.data_type) { + let mismatch = TypeMismatch { + schema_name: schema_name.clone(), + table_name: table_name.clone(), + column_name: col.column_name.clone(), + column_type: col.column_type.clone(), + configured_type: sharded.data_type, + }; + self.type_mismatches.push(mismatch); + continue; + } + return Some(sharded.clone()); } } @@ -151,16 +220,24 @@ impl<'a> ForeignTableBuilder<'a> { } /// Build the CREATE TABLE / CREATE FOREIGN TABLE statement(s). - pub fn build(self) -> Result, Error> { + pub fn build(mut self) -> Result { let first = self.columns.first().ok_or(Error::NoColumns)?; - let schema_name = &first.schema_name; - let table_name = &first.table_name; - - if let Some(sharded) = self.find_sharded_config() { - self.build_sharded(table_name, schema_name, sharded) + let schema_name = &first.schema_name.clone(); + let table_name = &first.table_name.clone(); + + let statements = if let Some(sharded) = self.find_sharded_config() { + self.build_sharded(table_name, schema_name, &sharded)? + } else if !self.type_mismatches.is_empty() { + // Skip tables with type mismatches entirely + vec![] } else { - self.build_foreign_table(table_name, schema_name) - } + self.build_foreign_table(table_name, schema_name)? + }; + + Ok(CreateForeignTableResult { + statements, + type_mismatches: self.type_mismatches, + }) } /// Build a simple foreign table (non-sharded). @@ -328,11 +405,11 @@ impl<'a> ForeignTableBuilder<'a> { /// configuration, creates a partitioned parent table with foreign table partitions /// for each shard. Server names are generated as `shard_{n}`. /// -/// Returns a list of SQL statements to execute in order. +/// Returns statements and any type mismatches encountered. 
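+///
+/// Sketch of the call site used during FDW setup (mirroring schema.rs):
+///
+///   let result = create_foreign_table(&columns, &sharding_schema)?;
+///   for sql in &result.statements {
+///       server.execute(sql).await?;
+///   }
+///   // result.type_mismatches lists columns skipped due to config mismatches.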
pub fn create_foreign_table( columns: &[ForeignTableColumn], sharding_schema: &ShardingSchema, -) -> Result, Error> { +) -> Result { ForeignTableBuilder::new(columns, sharding_schema).build() } diff --git a/pgdog/src/frontend/client/query_engine/connect.rs b/pgdog/src/frontend/client/query_engine/connect.rs index cff45aa6..6d35f8ed 100644 --- a/pgdog/src/frontend/client/query_engine/connect.rs +++ b/pgdog/src/frontend/client/query_engine/connect.rs @@ -30,9 +30,20 @@ impl QueryEngine { } let connect_route = connect_route.unwrap_or(context.client_request.route()); - let connect_route = if (context.params.is_postgres_fdw() || connect_route.is_cross_shard()) - && self.backend.cluster()?.cross_shard_backend().need_fdw() - { + + // Use fdw backend if: + // + // 1. The client asked via SET pgdog.backend and the query is NOT DDL or + // 2. The query is cross-shard && NOT DDL and + // 3. FDW is enabled + // + let use_fdw = self.backend.cluster()?.cross_shard_backend().need_fdw() + && (context.params.is_postgres_fdw() && !connect_route.is_ddl() + || connect_route.use_fdw()); + + debug!("using fdw fallback: {}", use_fdw); + + let connect_route = if use_fdw { lazy_static! { static ref FDW_ROUTE: Route = Route::fdw_fallback(); } diff --git a/pgdog/src/frontend/router/parser/query/ddl.rs b/pgdog/src/frontend/router/parser/query/ddl.rs index c3c2a29c..cd3c4cad 100644 --- a/pgdog/src/frontend/router/parser/query/ddl.rs +++ b/pgdog/src/frontend/router/parser/query/ddl.rs @@ -225,7 +225,9 @@ impl QueryParser { calculator.push(ShardWithPriority::new_table(shard)); Ok(Command::Query( - Route::write(calculator.shard()).with_schema_changed(schema_changed), + Route::write(calculator.shard()) + .with_schema_changed(schema_changed) + .with_ddl(true), )) } diff --git a/pgdog/src/frontend/router/parser/route.rs b/pgdog/src/frontend/router/parser/route.rs index f1de4b3f..8f4cfe73 100644 --- a/pgdog/src/frontend/router/parser/route.rs +++ b/pgdog/src/frontend/router/parser/route.rs @@ -91,6 +91,7 @@ pub struct Route { search_path_driven: bool, schema_changed: bool, fdw_fallback: bool, + ddl: bool, } impl Display for Route { @@ -192,6 +193,10 @@ impl Route { self.is_all_shards() || self.is_multi_shard() } + pub fn use_fdw(&self) -> bool { + self.is_cross_shard() && !self.is_ddl() + } + pub fn order_by(&self) -> &[OrderBy] { &self.order_by } @@ -226,6 +231,15 @@ impl Route { self } + pub fn with_ddl(mut self, ddl: bool) -> Self { + self.ddl = ddl; + self + } + + pub fn is_ddl(&self) -> bool { + self.ddl + } + pub fn set_search_path_driven_mut(&mut self, schema_driven: bool) { self.search_path_driven = schema_driven; } From 50b5b53e2f8e4582e816a803960921b4feb007f7 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 5 Feb 2026 09:54:19 -0800 Subject: [PATCH 19/29] fix tests --- .../backend/schema/postgres_fdw/statement.rs | 56 ++++++++++--------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/pgdog/src/backend/schema/postgres_fdw/statement.rs b/pgdog/src/backend/schema/postgres_fdw/statement.rs index 752db0ad..5d67ed3a 100644 --- a/pgdog/src/backend/schema/postgres_fdw/statement.rs +++ b/pgdog/src/backend/schema/postgres_fdw/statement.rs @@ -455,9 +455,11 @@ mod test { table: &str, column: &str, mapping: Mapping, + data_type: DataType, ) -> ShardedTable { ShardedTable { mapping: Some(mapping), + data_type, ..test_sharded_table(table, column) } } @@ -518,8 +520,8 @@ mod test { let schema = sharding_schema_with_tables(ShardedTables::default(), 1); let statements = create_foreign_table(&columns, 
&schema).unwrap(); - assert_eq!(statements.len(), 1); - let sql = &statements[0]; + assert_eq!(statements.statements.len(), 1); + let sql = &statements.statements[0]; assert!(sql.contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); assert!(sql.contains("bigint")); assert!(sql.contains("NOT NULL")); @@ -546,19 +548,19 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); - assert_eq!(statements.len(), 3); // parent + 2 partitions - assert!(statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); - assert!(statements[0].contains(r#"PARTITION BY HASH ("id")"#)); - assert!(statements[1].contains( + assert_eq!(statements.statements.len(), 3); // parent + 2 partitions + assert!(statements.statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); + assert!(statements.statements[0].contains(r#"PARTITION BY HASH ("id")"#)); + assert!(statements.statements[1].contains( r#"CREATE FOREIGN TABLE "public"."test_table_shard_0" PARTITION OF "public"."test_table""# )); - assert!(statements[1].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 0)")); - assert!(statements[1].contains(r#"SERVER "shard_0""#)); - assert!(statements[2].contains( + assert!(statements.statements[1].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 0)")); + assert!(statements.statements[1].contains(r#"SERVER "shard_0""#)); + assert!(statements.statements[2].contains( r#"CREATE FOREIGN TABLE "public"."test_table_shard_1" PARTITION OF "public"."test_table""# )); - assert!(statements[2].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 1)")); - assert!(statements[2].contains(r#"SERVER "shard_1""#)); + assert!(statements.statements[2].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 1)")); + assert!(statements.statements[2].contains(r#"SERVER "shard_1""#)); } #[test] @@ -569,6 +571,7 @@ mod test { "test_table", "region", list_mapping(), + DataType::Varchar, )] .as_slice() .into(); @@ -576,8 +579,8 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); - assert!(statements[0].contains(r#"PARTITION BY LIST ("region")"#)); + assert!(statements.statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); + assert!(statements.statements[0].contains(r#"PARTITION BY LIST ("region")"#)); } #[test] @@ -588,6 +591,7 @@ mod test { "test_table", "id", range_mapping(), + DataType::Bigint, )] .as_slice() .into(); @@ -595,8 +599,8 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); - assert!(statements[0].contains(r#"PARTITION BY RANGE ("id")"#)); + assert!(statements.statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); + assert!(statements.statements[0].contains(r#"PARTITION BY RANGE ("id")"#)); } #[test] @@ -610,9 +614,9 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); - assert_eq!(statements.len(), 1); - assert!(statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); - assert!(!statements[0].contains("PARTITION BY")); + assert_eq!(statements.statements.len(), 1); + assert!(statements.statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); + assert!(!statements.statements[0].contains("PARTITION BY")); } #[test] @@ -626,9 +630,9 @@ mod test { let statements = create_foreign_table(&columns, &schema).unwrap(); - assert_eq!(statements.len(), 1); - assert!(statements[0].contains(r#"CREATE FOREIGN TABLE 
"public"."test_table""#)); - assert!(!statements[0].contains("PARTITION BY")); + assert_eq!(statements.statements.len(), 1); + assert!(statements.statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); + assert!(!statements.statements[0].contains("PARTITION BY")); } #[test] @@ -642,10 +646,10 @@ mod test { let schema = sharding_schema_with_tables(ShardedTables::default(), 1); let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); + assert!(statements.statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); // Defaults and generated columns handled by remote table - assert!(!statements[0].contains("GENERATED")); - assert!(!statements[0].contains("DEFAULT")); + assert!(!statements.statements[0].contains("GENERATED")); + assert!(!statements.statements[0].contains("DEFAULT")); } #[test] @@ -659,8 +663,8 @@ mod test { let schema = sharding_schema_with_tables(ShardedTables::default(), 1); let statements = create_foreign_table(&columns, &schema).unwrap(); - assert!(statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); - assert!(statements[0].contains(r#"COLLATE "pg_catalog"."en_US""#)); + assert!(statements.statements[0].contains(r#"CREATE FOREIGN TABLE "public"."test_table""#)); + assert!(statements.statements[0].contains(r#"COLLATE "pg_catalog"."en_US""#)); } #[test] From de15859ab478b85f13a7b2cd32d0df2bdcd8e615 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 5 Feb 2026 10:46:02 -0800 Subject: [PATCH 20/29] tests --- pgdog/src/backend/schema/postgres_fdw/mod.rs | 7 +- .../schema/postgres_fdw/postgres_fdw.sql | 14 +- .../src/backend/schema/postgres_fdw/schema.rs | 130 ++++---- .../backend/schema/postgres_fdw/statement.rs | 296 ++++++++++++++++- .../schema/postgres_fdw/test/helpers.rs | 280 ++++++++++++++++ .../backend/schema/postgres_fdw/test/mod.rs | 309 ++++++++++++++++++ .../frontend/client/query_engine/connect.rs | 24 +- pgdog/src/frontend/router/parser/query/mod.rs | 10 +- 8 files changed, 983 insertions(+), 87 deletions(-) create mode 100644 pgdog/src/backend/schema/postgres_fdw/test/helpers.rs create mode 100644 pgdog/src/backend/schema/postgres_fdw/test/mod.rs diff --git a/pgdog/src/backend/schema/postgres_fdw/mod.rs b/pgdog/src/backend/schema/postgres_fdw/mod.rs index d5ccb3d9..b2f6957e 100644 --- a/pgdog/src/backend/schema/postgres_fdw/mod.rs +++ b/pgdog/src/backend/schema/postgres_fdw/mod.rs @@ -6,13 +6,16 @@ mod extensions; mod schema; mod statement; +#[cfg(test)] +mod test; + pub use custom_types::{CustomType, CustomTypeKind, CustomTypes, CUSTOM_TYPES_QUERY}; pub use error::Error; pub use extensions::{Extension, Extensions, EXTENSIONS_QUERY}; pub use schema::{FdwServerDef, ForeignTableColumn, ForeignTableSchema, FOREIGN_TABLE_SCHEMA}; pub use statement::{ - create_foreign_table, CreateForeignTableResult, ForeignTableBuilder, PartitionStrategy, - TypeMismatch, + create_foreign_table, create_foreign_table_with_children, CreateForeignTableResult, + ForeignTableBuilder, PartitionStrategy, TypeMismatch, }; pub(crate) use statement::quote_identifier; diff --git a/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql b/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql index 16ef63e5..d010ceaf 100644 --- a/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql +++ b/pgdog/src/backend/schema/postgres_fdw/postgres_fdw.sql @@ -7,7 +7,12 @@ SELECT pg_catalog.pg_get_expr(ad.adbin, ad.adrelid)::text AS column_default, 
a.attgenerated::text AS generated, coll.collname::text AS collation_name, - collnsp.nspname::text AS collation_schema + collnsp.nspname::text AS collation_schema, + c.relispartition::text AS is_partition, + COALESCE(parent_class.relname, '')::text AS parent_table_name, + COALESCE(parent_ns.nspname, '')::text AS parent_schema_name, + COALESCE(pg_catalog.pg_get_expr(c.relpartbound, c.oid), '')::text AS partition_bound, + COALESCE(pg_catalog.pg_get_partkeydef(c.oid), '')::text AS partition_key FROM pg_catalog.pg_class c JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid @@ -22,13 +27,18 @@ LEFT JOIN pg_catalog.pg_collation coll ON coll.oid = a.attcollation LEFT JOIN pg_catalog.pg_namespace collnsp ON collnsp.oid = coll.collnamespace +LEFT JOIN pg_catalog.pg_inherits inh ON + inh.inhrelid = c.oid +LEFT JOIN pg_catalog.pg_class parent_class ON + parent_class.oid = inh.inhparent +LEFT JOIN pg_catalog.pg_namespace parent_ns ON + parent_ns.oid = parent_class.relnamespace WHERE c.relkind IN ('r', 'v', 'f', 'm', 'p') AND n.nspname <> 'pg_catalog' AND n.nspname !~ '^pg_toast' AND n.nspname <> 'information_schema' AND NOT (n.nspname = 'pgdog' AND c.relname IN ('validator_bigint', 'validator_uuid', 'config')) - AND NOT c.relispartition ORDER BY n.nspname, c.relname, diff --git a/pgdog/src/backend/schema/postgres_fdw/schema.rs b/pgdog/src/backend/schema/postgres_fdw/schema.rs index 6a117b1b..3737affa 100644 --- a/pgdog/src/backend/schema/postgres_fdw/schema.rs +++ b/pgdog/src/backend/schema/postgres_fdw/schema.rs @@ -4,7 +4,10 @@ use std::collections::{HashMap, HashSet}; use tracing::{debug, warn}; use crate::{ - backend::{schema::postgres_fdw::create_foreign_table, Server, ShardingSchema}, + backend::{ + schema::postgres_fdw::{create_foreign_table, create_foreign_table_with_children}, + Server, ShardingSchema, + }, net::messages::DataRow, }; @@ -45,6 +48,16 @@ pub struct ForeignTableColumn { pub generated: String, pub collation_name: String, pub collation_schema: String, + /// Whether this table is a partition of another table. + pub is_partition: bool, + /// Parent table name if this is a partition, empty otherwise. + pub parent_table_name: String, + /// Parent schema name if this is a partition, empty otherwise. + pub parent_schema_name: String, + /// Partition bound expression, e.g. "FOR VALUES FROM ('2024-01-01') TO ('2025-01-01')". + pub partition_bound: String, + /// Partition key definition if this is a partitioned table, e.g. "RANGE (created_at)". 
+ pub partition_key: String, } impl From for ForeignTableColumn { @@ -59,6 +72,11 @@ impl From for ForeignTableColumn { generated: value.get_text(6).unwrap_or_default(), collation_name: value.get_text(7).unwrap_or_default(), collation_schema: value.get_text(8).unwrap_or_default(), + is_partition: value.get_text(9).unwrap_or_default() == "true", + parent_table_name: value.get_text(10).unwrap_or_default(), + parent_schema_name: value.get_text(11).unwrap_or_default(), + partition_bound: value.get_text(12).unwrap_or_default(), + partition_key: value.get_text(13).unwrap_or_default(), } } } @@ -134,7 +152,10 @@ impl ForeignTableSchema { // Create custom types (enums, domains, composite types) self.custom_types.setup(server).await?; - let mut tables = HashSet::new(); + // Build a map of parent tables to their child partitions + let children_map = self.build_children_map(); + + let mut processed_tables = HashSet::new(); let mut all_type_mismatches: Vec = Vec::new(); for ((schema, table), columns) in &self.tables { @@ -143,15 +164,36 @@ impl ForeignTableSchema { continue; } + // Skip partitions - they are handled when processing their parent + if columns.first().is_some_and(|c| c.is_partition) { + continue; + } + let dedup = (schema.clone(), table.clone()); - if !tables.contains(&dedup) { - let result = create_foreign_table(columns, sharding_schema)?; + if !processed_tables.contains(&dedup) { + // Check if this table has child partitions + let children = children_map + .get(&dedup) + .map(|child_keys| { + child_keys + .iter() + .filter_map(|key| self.tables.get(key).cloned()) + .collect::>() + }) + .unwrap_or_default(); + + let result = if children.is_empty() { + create_foreign_table(columns, sharding_schema)? + } else { + create_foreign_table_with_children(columns, sharding_schema, children)? + }; + for sql in &result.statements { debug!("[fdw::setup] {} [{}]", sql, server.addr()); server.execute(sql).await?; } all_type_mismatches.extend(result.type_mismatches); - tables.insert(dedup); + processed_tables.insert(dedup); } } @@ -202,11 +244,37 @@ impl ForeignTableSchema { &self.custom_types } + /// Get the tables map (for testing). + #[cfg(test)] + pub fn tables(&self) -> &HashMap<(String, String), Vec> { + &self.tables + } + /// Check if a table is an internal PgDog table that shouldn't be exposed via FDW. fn is_internal_table(schema: &str, table: &str) -> bool { schema == "pgdog" && matches!(table, "validator_bigint" | "validator_uuid" | "config") } + /// Build a map of parent tables to their child partition keys. + fn build_children_map(&self) -> HashMap<(String, String), Vec<(String, String)>> { + let mut children_map: HashMap<(String, String), Vec<(String, String)>> = HashMap::new(); + + for ((schema, table), columns) in &self.tables { + if let Some(first_col) = columns.first() { + if first_col.is_partition && !first_col.parent_table_name.is_empty() { + let parent_key = ( + first_col.parent_schema_name.clone(), + first_col.parent_table_name.clone(), + ); + let child_key = (schema.clone(), table.clone()); + children_map.entry(parent_key).or_default().push(child_key); + } + } + } + + children_map + } + /// Collect unique schemas from tables and custom types. 
fn schemas(&self) -> HashSet { self.tables @@ -271,55 +339,3 @@ impl ForeignTableColumn { Ok(result) } } - -#[cfg(test)] -mod test { - use super::*; - use crate::backend::server::test::test_server; - - #[tokio::test] - async fn test_load_foreign_table_schema() { - let mut server = test_server().await; - - server - .execute("DROP TABLE IF EXISTS test_fdw_schema") - .await - .unwrap(); - - server - .execute( - "CREATE TABLE test_fdw_schema ( - id BIGINT NOT NULL, - name VARCHAR(100) DEFAULT 'unknown', - score NUMERIC(10, 2), - created_at TIMESTAMP NOT NULL DEFAULT now() - )", - ) - .await - .unwrap(); - - let schema = ForeignTableSchema::load(&mut server).await.unwrap(); - let rows: Vec<_> = schema.tables.into_values().flatten().collect(); - - assert!(!rows.is_empty()); - - let test_rows: Vec<_> = rows - .iter() - .filter(|r| r.table_name == "test_fdw_schema") - .collect(); - assert_eq!(test_rows.len(), 4); - - let id_col = test_rows.iter().find(|r| r.column_name == "id").unwrap(); - assert!(id_col.is_not_null); - assert!(id_col.column_default.is_empty()); - - let name_col = test_rows.iter().find(|r| r.column_name == "name").unwrap(); - assert!(!name_col.is_not_null); - assert!(!name_col.column_default.is_empty()); - - server - .execute("DROP TABLE IF EXISTS test_fdw_schema") - .await - .unwrap(); - } -} diff --git a/pgdog/src/backend/schema/postgres_fdw/statement.rs b/pgdog/src/backend/schema/postgres_fdw/statement.rs index 5d67ed3a..d8f45cd7 100644 --- a/pgdog/src/backend/schema/postgres_fdw/statement.rs +++ b/pgdog/src/backend/schema/postgres_fdw/statement.rs @@ -134,6 +134,8 @@ pub struct ForeignTableBuilder<'a> { columns: &'a [ForeignTableColumn], sharding_schema: &'a ShardingSchema, type_mismatches: Vec, + /// Child partitions for two-tier partitioning (each Vec is columns for one partition). + children: Vec>, } impl<'a> ForeignTableBuilder<'a> { @@ -143,9 +145,17 @@ impl<'a> ForeignTableBuilder<'a> { columns, sharding_schema, type_mismatches: Vec::new(), + children: Vec::new(), } } + /// Add child partitions for two-tier partitioning. + /// Each Vec represents columns for one child partition. + pub fn with_children(mut self, children: Vec>) -> Self { + self.children = children; + self + } + /// Find the sharding configuration for this table, if any. /// Records any type mismatches encountered and returns a cloned config. fn find_sharded_config(&mut self) -> Option { @@ -309,11 +319,26 @@ impl<'a> ForeignTableBuilder<'a> { } /// Build a sharded table: parent table + foreign table partitions. + /// If children partitions exist, creates two-tier partitioning. fn build_sharded( &self, table_name: &str, schema_name: &str, sharded: &ShardedTable, + ) -> Result, Error> { + if self.children.is_empty() { + self.build_sharded_single_tier(table_name, schema_name, sharded) + } else { + self.build_sharded_two_tier(table_name, schema_name, sharded) + } + } + + /// Build single-tier sharding: parent table with foreign table partitions. 
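+    ///
+    /// For a two-shard hash-sharded table the generated statements have the
+    /// shape below (a sketch; see the unit tests for exact assertions):
+    ///
+    ///   CREATE TABLE "public"."t" (...) PARTITION BY HASH ("id");
+    ///   CREATE FOREIGN TABLE "public"."t_shard_0" PARTITION OF "public"."t"
+    ///       FOR VALUES WITH (MODULUS 2, REMAINDER 0) SERVER "shard_0" ...;
+    ///   CREATE FOREIGN TABLE "public"."t_shard_1" PARTITION OF "public"."t"
+    ///       FOR VALUES WITH (MODULUS 2, REMAINDER 1) SERVER "shard_1" ...;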
+ fn build_sharded_single_tier( + &self, + table_name: &str, + schema_name: &str, + sharded: &ShardedTable, ) -> Result, Error> { let strategy = PartitionStrategy::from_sharded_table(sharded); let mut statements = Vec::new(); @@ -333,6 +358,93 @@ impl<'a> ForeignTableBuilder<'a> { statements.push(parent); // Create foreign table partitions for each shard + self.build_foreign_partitions( + &mut statements, + table_name, + schema_name, + &qualified_name, + sharded, + )?; + + Ok(statements) + } + + /// Build two-tier sharding: parent table -> intermediate partitions -> foreign table partitions. + fn build_sharded_two_tier( + &self, + table_name: &str, + schema_name: &str, + sharded: &ShardedTable, + ) -> Result, Error> { + let shard_strategy = PartitionStrategy::from_sharded_table(sharded); + let mut statements = Vec::new(); + + // Get the parent's original partition key (e.g., "RANGE (created_at)") + let parent_partition_key = self + .columns + .first() + .map(|c| c.partition_key.as_str()) + .unwrap_or(""); + + // Create parent table with original PARTITION BY + let mut parent = String::new(); + let qualified_name = qualified_table(schema_name, table_name); + writeln!(parent, "CREATE TABLE {} (", qualified_name)?; + parent.push_str(&self.build_columns()?); + parent.push('\n'); + write!(parent, ") PARTITION BY {}", parent_partition_key)?; + statements.push(parent); + + // For each child partition, create an intermediate partition that is itself partitioned + for child_columns in &self.children { + let Some(first_col) = child_columns.first() else { + continue; + }; + + let child_table_name = &first_col.table_name; + let child_schema_name = &first_col.schema_name; + let partition_bound = &first_col.partition_bound; + + // Create intermediate partition table (partitioned by hash on shard key) + let mut intermediate = String::new(); + let qualified_child = qualified_table(child_schema_name, child_table_name); + + write!( + intermediate, + "CREATE TABLE {} PARTITION OF {} ", + qualified_child, qualified_name + )?; + write!(intermediate, "{}", partition_bound)?; + write!( + intermediate, + " PARTITION BY {} ({})", + shard_strategy.as_sql(), + quote_identifier(&sharded.column) + )?; + statements.push(intermediate); + + // Create foreign table partitions for this intermediate partition + self.build_foreign_partitions( + &mut statements, + child_table_name, + child_schema_name, + &qualified_child, + sharded, + )?; + } + + Ok(statements) + } + + /// Build foreign table partitions for each shard. 
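+    ///
+    /// Emits one statement per shard, of the form (sketch):
+    ///
+    ///   CREATE FOREIGN TABLE {parent}_shard_{n} PARTITION OF {parent}
+    ///       FOR VALUES ... SERVER "shard_{n}" ...
+    ///
+    /// Hash bounds (MODULUS/REMAINDER) are used when no mapping is
+    /// configured; list and range mappings emit their configured bounds.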
+ fn build_foreign_partitions( + &self, + statements: &mut Vec, + table_name: &str, + schema_name: &str, + qualified_parent: &str, + sharded: &ShardedTable, + ) -> Result<(), Error> { for shard in 0..self.sharding_schema.shards { let mut partition = String::new(); let partition_table_name = format!("{}_shard_{}", table_name, shard); @@ -342,13 +454,12 @@ impl<'a> ForeignTableBuilder<'a> { write!( partition, "CREATE FOREIGN TABLE {} PARTITION OF {} ", - qualified_partition, qualified_name + qualified_partition, qualified_parent )?; - // Partition bounds + // Partition bounds (always hash for foreign partitions in two-tier) match &sharded.mapping { None => { - // Hash partitioning write!( partition, "FOR VALUES WITH (MODULUS {}, REMAINDER {})", @@ -394,8 +505,7 @@ impl<'a> ForeignTableBuilder<'a> { statements.push(partition); } - - Ok(statements) + Ok(()) } } @@ -413,6 +523,24 @@ pub fn create_foreign_table( ForeignTableBuilder::new(columns, sharding_schema).build() } +/// Generate CREATE FOREIGN TABLE statements with two-tier partitioning. +/// +/// For sharded tables with existing partitions, creates: +/// 1. Parent table with PARTITION BY (on shard key) +/// 2. Intermediate partition tables (with original bounds, further partitioned by shard key) +/// 3. Foreign table partitions for each shard +/// +/// Each entry in `children` is columns for one child partition. +pub fn create_foreign_table_with_children( + columns: &[ForeignTableColumn], + sharding_schema: &ShardingSchema, + children: Vec>, +) -> Result { + ForeignTableBuilder::new(columns, sharding_schema) + .with_children(children) + .build() +} + #[cfg(test)] mod test { use std::collections::HashSet; @@ -432,6 +560,11 @@ mod test { generated: String::new(), collation_name: String::new(), collation_schema: String::new(), + is_partition: false, + parent_table_name: String::new(), + parent_schema_name: String::new(), + partition_bound: String::new(), + partition_key: String::new(), } } @@ -684,4 +817,157 @@ mod test { assert_eq!(quote_identifier("CamelCase"), "\"CamelCase\""); assert_eq!(quote_identifier("_valid"), "\"_valid\""); } + + fn test_partition_column( + table_name: &str, + name: &str, + col_type: &str, + parent_table: &str, + partition_bound: &str, + ) -> ForeignTableColumn { + ForeignTableColumn { + schema_name: "public".into(), + table_name: table_name.into(), + column_name: name.into(), + column_type: col_type.into(), + is_not_null: false, + column_default: String::new(), + generated: String::new(), + collation_name: String::new(), + collation_schema: String::new(), + is_partition: true, + parent_table_name: parent_table.into(), + parent_schema_name: "public".into(), + partition_bound: partition_bound.into(), + partition_key: String::new(), + } + } + + fn test_partitioned_parent_column( + name: &str, + col_type: &str, + partition_key: &str, + ) -> ForeignTableColumn { + ForeignTableColumn { + partition_key: partition_key.into(), + ..test_column(name, col_type) + } + } + + #[test] + fn test_create_foreign_table_two_tier_partitioning() { + // Parent table "orders" partitioned by RANGE on date, with children partitioned by hash across shards + let parent_columns = vec![ + test_partitioned_parent_column("id", "bigint", "RANGE (created_at)"), + test_partitioned_parent_column("created_at", "date", "RANGE (created_at)"), + test_partitioned_parent_column("data", "text", "RANGE (created_at)"), + ]; + + // Child partitions with their bounds + let partition_2024 = vec![ + test_partition_column( + "orders_2024", + "id", + "bigint", + 
"test_table", + "FOR VALUES FROM ('2024-01-01') TO ('2025-01-01')", + ), + test_partition_column( + "orders_2024", + "created_at", + "date", + "test_table", + "FOR VALUES FROM ('2024-01-01') TO ('2025-01-01')", + ), + test_partition_column( + "orders_2024", + "data", + "text", + "test_table", + "FOR VALUES FROM ('2024-01-01') TO ('2025-01-01')", + ), + ]; + + let partition_2025 = vec![ + test_partition_column( + "orders_2025", + "id", + "bigint", + "test_table", + "FOR VALUES FROM ('2025-01-01') TO ('2026-01-01')", + ), + test_partition_column( + "orders_2025", + "created_at", + "date", + "test_table", + "FOR VALUES FROM ('2025-01-01') TO ('2026-01-01')", + ), + test_partition_column( + "orders_2025", + "data", + "text", + "test_table", + "FOR VALUES FROM ('2025-01-01') TO ('2026-01-01')", + ), + ]; + + let tables: ShardedTables = [test_sharded_table("test_table", "id")].as_slice().into(); + let schema = sharding_schema_with_tables(tables, 2); + + let result = ForeignTableBuilder::new(&parent_columns, &schema) + .with_children(vec![partition_2024, partition_2025]) + .build() + .unwrap(); + + // Expected structure: + // 1. Parent table with original PARTITION BY RANGE (created_at) + // 2. orders_2024 partition (with original bounds) that is itself PARTITION BY HASH + // 3. orders_2024_shard_0 foreign table + // 4. orders_2024_shard_1 foreign table + // 5. orders_2025 partition (with original bounds) that is itself PARTITION BY HASH + // 6. orders_2025_shard_0 foreign table + // 7. orders_2025_shard_1 foreign table + assert_eq!(result.statements.len(), 7); + + // Parent table - uses original partition key (RANGE on date) + assert!(result.statements[0].contains(r#"CREATE TABLE "public"."test_table""#)); + assert!(result.statements[0].contains("PARTITION BY RANGE (created_at)")); + + // orders_2024 intermediate partition + assert!(result.statements[1] + .contains(r#"CREATE TABLE "public"."orders_2024" PARTITION OF "public"."test_table""#)); + assert!(result.statements[1].contains("FOR VALUES FROM ('2024-01-01') TO ('2025-01-01')")); + assert!(result.statements[1].contains(r#"PARTITION BY HASH ("id")"#)); + + // orders_2024_shard_0 foreign table + assert!(result.statements[2].contains( + r#"CREATE FOREIGN TABLE "public"."orders_2024_shard_0" PARTITION OF "public"."orders_2024""# + )); + assert!(result.statements[2].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 0)")); + assert!(result.statements[2].contains(r#"SERVER "shard_0""#)); + + // orders_2024_shard_1 foreign table + assert!(result.statements[3].contains( + r#"CREATE FOREIGN TABLE "public"."orders_2024_shard_1" PARTITION OF "public"."orders_2024""# + )); + assert!(result.statements[3].contains("FOR VALUES WITH (MODULUS 2, REMAINDER 1)")); + assert!(result.statements[3].contains(r#"SERVER "shard_1""#)); + + // orders_2025 intermediate partition + assert!(result.statements[4] + .contains(r#"CREATE TABLE "public"."orders_2025" PARTITION OF "public"."test_table""#)); + assert!(result.statements[4].contains("FOR VALUES FROM ('2025-01-01') TO ('2026-01-01')")); + assert!(result.statements[4].contains(r#"PARTITION BY HASH ("id")"#)); + + // orders_2025_shard_0 foreign table + assert!(result.statements[5].contains( + r#"CREATE FOREIGN TABLE "public"."orders_2025_shard_0" PARTITION OF "public"."orders_2025""# + )); + + // orders_2025_shard_1 foreign table + assert!(result.statements[6].contains( + r#"CREATE FOREIGN TABLE "public"."orders_2025_shard_1" PARTITION OF "public"."orders_2025""# + )); + } } diff --git 
a/pgdog/src/backend/schema/postgres_fdw/test/helpers.rs b/pgdog/src/backend/schema/postgres_fdw/test/helpers.rs new file mode 100644 index 00000000..ec33a07c --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/test/helpers.rs @@ -0,0 +1,280 @@ +//! Test helpers for postgres_fdw integration tests. + +use std::collections::HashSet; + +use crate::backend::pool::ShardingSchema; +use crate::backend::replication::ShardedTables; +use crate::backend::schema::postgres_fdw::{ + CreateForeignTableResult, ForeignTableBuilder, ForeignTableColumn, ForeignTableSchema, +}; +use crate::backend::Server; +use crate::config::{DataType, FlexibleType, ShardedMapping, ShardedMappingKind, ShardedTable}; +use crate::frontend::router::sharding::Mapping; + +/// Data type configuration for test tables. +#[derive(Debug, Clone, Copy)] +pub enum TestDataType { + Bigint, + Varchar, + Uuid, +} + +impl TestDataType { + pub fn name(&self) -> &'static str { + match self { + Self::Bigint => "bigint", + Self::Varchar => "varchar", + Self::Uuid => "uuid", + } + } + + pub fn sql_type(&self) -> &'static str { + match self { + Self::Bigint => "BIGINT", + Self::Varchar => "VARCHAR(100)", + Self::Uuid => "UUID", + } + } + + pub fn config_type(&self) -> DataType { + match self { + Self::Bigint => DataType::Bigint, + Self::Varchar => DataType::Varchar, + Self::Uuid => DataType::Uuid, + } + } + + pub fn flexible_values(&self) -> Vec { + match self { + Self::Bigint => vec![ + FlexibleType::Integer(1), + FlexibleType::Integer(2), + FlexibleType::Integer(3), + ], + Self::Varchar => vec![ + FlexibleType::String("us".into()), + FlexibleType::String("eu".into()), + FlexibleType::String("asia".into()), + ], + Self::Uuid => vec![ + FlexibleType::Uuid("a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11".parse().unwrap()), + FlexibleType::Uuid("b0eebc99-9c0b-4ef8-bb6d-6bb9bd380a22".parse().unwrap()), + FlexibleType::Uuid("c0eebc99-9c0b-4ef8-bb6d-6bb9bd380a33".parse().unwrap()), + ], + } + } +} + +/// Partition strategy for tests. +#[derive(Debug, Clone, Copy)] +pub enum TestPartitionStrategy { + Hash, + List, + Range, +} + +impl TestPartitionStrategy { + pub fn as_str(&self) -> &'static str { + match self { + Self::Hash => "HASH", + Self::List => "LIST", + Self::Range => "RANGE", + } + } +} + +/// Test fixture for FDW statement generation tests. 
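+///
+/// Typical flow, as driven by the tests in mod.rs (a sketch):
+///
+///   let fixture = FdwTestFixture::new(
+///       "test_hash_bigint",
+///       "shard_key",
+///       TestDataType::Bigint,
+///       TestPartitionStrategy::Hash,
+///   );
+///   fixture.create_table(&mut server).await?;
+///   let sharding_schema = fixture.sharding_schema("public");
+///   let result = fixture.generate_statements(&columns, &sharding_schema);
+///   fixture.verify_statements(&result.statements);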
+pub struct FdwTestFixture { + pub table_name: String, + pub shard_column: String, + pub data_type: TestDataType, + pub strategy: TestPartitionStrategy, + pub num_shards: usize, +} + +impl FdwTestFixture { + pub fn new( + table_name: &str, + shard_column: &str, + data_type: TestDataType, + strategy: TestPartitionStrategy, + ) -> Self { + Self { + table_name: table_name.into(), + shard_column: shard_column.into(), + data_type, + strategy, + num_shards: 2, + } + } + + pub async fn create_table(&self, server: &mut Server) -> Result<(), crate::backend::Error> { + self.cleanup(server).await?; + + let sql = format!( + "CREATE TABLE {} ( + id BIGINT NOT NULL, + {} {} NOT NULL, + data TEXT + )", + self.table_name, + self.shard_column, + self.data_type.sql_type() + ); + server.execute(&sql).await?; + Ok(()) + } + + pub async fn cleanup(&self, server: &mut Server) -> Result<(), crate::backend::Error> { + server + .execute(&format!("DROP TABLE IF EXISTS {} CASCADE", self.table_name)) + .await?; + server + .execute(&format!( + "DROP TABLE IF EXISTS {}_fdw CASCADE", + self.table_name + )) + .await?; + Ok(()) + } + + pub fn sharding_schema(&self, schema_name: &str) -> ShardingSchema { + let mapping = self.create_mapping(); + let sharded_table = ShardedTable { + database: "test".into(), + name: Some(self.table_name.clone()), + schema: Some(schema_name.into()), + column: self.shard_column.clone(), + data_type: self.data_type.config_type(), + mapping, + ..Default::default() + }; + + let tables: ShardedTables = [sharded_table].as_slice().into(); + ShardingSchema { + shards: self.num_shards, + tables, + ..Default::default() + } + } + + fn create_mapping(&self) -> Option { + match self.strategy { + TestPartitionStrategy::Hash => None, + TestPartitionStrategy::List => { + let values = self.data_type.flexible_values(); + let mappings: Vec<_> = values + .into_iter() + .enumerate() + .map(|(i, v)| ShardedMapping { + database: "test".into(), + column: self.shard_column.clone(), + table: Some(self.table_name.clone()), + kind: ShardedMappingKind::List, + values: HashSet::from([v]), + shard: i % self.num_shards, + ..Default::default() + }) + .collect(); + Mapping::new(&mappings) + } + TestPartitionStrategy::Range => { + let values = self.data_type.flexible_values(); + let mappings: Vec<_> = (0..self.num_shards) + .map(|shard| { + let (start, end) = if shard == 0 { + (None, Some(values[1].clone())) + } else { + (Some(values[1].clone()), None) + }; + ShardedMapping { + database: "test".into(), + column: self.shard_column.clone(), + table: Some(self.table_name.clone()), + kind: ShardedMappingKind::Range, + start, + end, + shard, + ..Default::default() + } + }) + .collect(); + Mapping::new(&mappings) + } + } + } + + pub fn generate_statements( + &self, + columns: &[ForeignTableColumn], + sharding_schema: &ShardingSchema, + ) -> CreateForeignTableResult { + ForeignTableBuilder::new(columns, sharding_schema) + .build() + .expect("Statement generation should succeed") + } + + fn expected_statement_count(&self) -> usize { + 1 + self.num_shards + } + + pub fn verify_statements(&self, statements: &[String]) { + assert_eq!( + statements.len(), + self.expected_statement_count(), + "Expected {} statements for {}, got {}", + self.expected_statement_count(), + self.table_name, + statements.len() + ); + + assert!( + statements[0].contains("CREATE TABLE"), + "First statement should be CREATE TABLE: {}", + statements[0] + ); + assert!( + statements[0].contains(&format!("PARTITION BY {}", self.strategy.as_str())), + "Parent should use 
{} partitioning: {}", + self.strategy.as_str(), + statements[0] + ); + + for (i, stmt) in statements.iter().skip(1).enumerate() { + assert!( + stmt.contains("CREATE FOREIGN TABLE"), + "Statement {} should be CREATE FOREIGN TABLE: {}", + i + 1, + stmt + ); + assert!( + stmt.contains(&format!("shard_{}", i)), + "Partition {} should reference shard_{}: {}", + i, + i, + stmt + ); + } + } + + pub async fn execute_parent_statement( + &self, + server: &mut Server, + statements: &[String], + ) -> Result<(), crate::backend::Error> { + let parent_stmt = + statements[0].replace(&self.table_name, &format!("{}_fdw", self.table_name)); + server.execute(&parent_stmt).await?; + Ok(()) + } +} + +pub fn find_table_columns<'a>( + schema: &'a ForeignTableSchema, + table_name: &str, +) -> Option<&'a Vec> { + schema + .tables() + .get(&("public".into(), table_name.into())) + .or_else(|| schema.tables().get(&("pgdog".into(), table_name.into()))) +} diff --git a/pgdog/src/backend/schema/postgres_fdw/test/mod.rs b/pgdog/src/backend/schema/postgres_fdw/test/mod.rs new file mode 100644 index 00000000..c36f0a6b --- /dev/null +++ b/pgdog/src/backend/schema/postgres_fdw/test/mod.rs @@ -0,0 +1,309 @@ +//! Integration tests for postgres_fdw statement generation. + +mod helpers; + +use helpers::{find_table_columns, FdwTestFixture, TestDataType, TestPartitionStrategy}; + +use super::ForeignTableSchema; +use crate::backend::server::test::test_server; + +/// Test a single-tier sharding scenario. +async fn run_single_tier_test(data_type: TestDataType, strategy: TestPartitionStrategy) { + let mut server = test_server().await; + let fixture = FdwTestFixture::new( + &format!( + "test_{}_{}", + strategy.as_str().to_lowercase(), + data_type.name() + ), + "shard_key", + data_type, + strategy, + ); + + // Create source table + fixture.create_table(&mut server).await.unwrap(); + + // Load schema + let schema = ForeignTableSchema::load(&mut server).await.unwrap(); + + // Find table columns + let columns = find_table_columns(&schema, &fixture.table_name).expect("Table should be loaded"); + + // Get schema name from loaded columns + let schema_name = &columns.first().unwrap().schema_name; + + // Generate statements + let sharding_schema = fixture.sharding_schema(schema_name); + let result = fixture.generate_statements(columns, &sharding_schema); + + // Verify statement structure + fixture.verify_statements(&result.statements); + + // Execute parent statement to verify SQL validity + fixture + .execute_parent_statement(&mut server, &result.statements) + .await + .expect("Parent statement should execute successfully"); + + // Cleanup + fixture.cleanup(&mut server).await.unwrap(); +} + +// Hash partitioning tests +#[tokio::test] +async fn test_hash_bigint() { + run_single_tier_test(TestDataType::Bigint, TestPartitionStrategy::Hash).await; +} + +#[tokio::test] +async fn test_hash_varchar() { + run_single_tier_test(TestDataType::Varchar, TestPartitionStrategy::Hash).await; +} + +#[tokio::test] +async fn test_hash_uuid() { + run_single_tier_test(TestDataType::Uuid, TestPartitionStrategy::Hash).await; +} + +// List partitioning tests +#[tokio::test] +async fn test_list_bigint() { + run_single_tier_test(TestDataType::Bigint, TestPartitionStrategy::List).await; +} + +#[tokio::test] +async fn test_list_varchar() { + run_single_tier_test(TestDataType::Varchar, TestPartitionStrategy::List).await; +} + +#[tokio::test] +async fn test_list_uuid() { + run_single_tier_test(TestDataType::Uuid, TestPartitionStrategy::List).await; +} + +// Range 
partitioning tests +#[tokio::test] +async fn test_range_bigint() { + run_single_tier_test(TestDataType::Bigint, TestPartitionStrategy::Range).await; +} + +#[tokio::test] +async fn test_range_varchar() { + run_single_tier_test(TestDataType::Varchar, TestPartitionStrategy::Range).await; +} + +#[tokio::test] +async fn test_range_uuid() { + run_single_tier_test(TestDataType::Uuid, TestPartitionStrategy::Range).await; +} + +// Existing tests refactored to use helpers + +#[tokio::test] +async fn test_load_partitioned_table_schema() { + let mut server = test_server().await; + + server + .execute("DROP TABLE IF EXISTS test_partitioned_parent CASCADE") + .await + .unwrap(); + + server + .execute( + "CREATE TABLE test_partitioned_parent ( + id BIGINT NOT NULL, + created_at DATE NOT NULL, + data TEXT + ) PARTITION BY RANGE (created_at)", + ) + .await + .unwrap(); + + server + .execute( + "CREATE TABLE test_partitioned_parent_2024 PARTITION OF test_partitioned_parent + FOR VALUES FROM ('2024-01-01') TO ('2025-01-01')", + ) + .await + .unwrap(); + + server + .execute( + "CREATE TABLE test_partitioned_parent_2025 PARTITION OF test_partitioned_parent + FOR VALUES FROM ('2025-01-01') TO ('2026-01-01')", + ) + .await + .unwrap(); + + let schema = ForeignTableSchema::load(&mut server).await.unwrap(); + + let parent_cols = find_table_columns(&schema, "test_partitioned_parent"); + assert!(parent_cols.is_some(), "Parent table should be loaded"); + let parent_cols = parent_cols.unwrap(); + + let first_parent_col = parent_cols.first().unwrap(); + assert!(!first_parent_col.is_partition); + assert!( + first_parent_col.partition_key.contains("RANGE"), + "Parent should have RANGE partition key, got: {}", + first_parent_col.partition_key + ); + + let child_2024 = find_table_columns(&schema, "test_partitioned_parent_2024"); + assert!(child_2024.is_some(), "Child 2024 should be loaded"); + let first_child = child_2024.unwrap().first().unwrap(); + assert!(first_child.is_partition); + assert_eq!(first_child.parent_table_name, "test_partitioned_parent"); + + server + .execute("DROP TABLE IF EXISTS test_partitioned_parent CASCADE") + .await + .unwrap(); +} + +#[tokio::test] +async fn test_two_tier_partitioning() { + use crate::backend::pool::ShardingSchema; + use crate::backend::replication::ShardedTables; + use crate::config::{DataType, ShardedTable}; + + let mut server = test_server().await; + + server + .execute("DROP TABLE IF EXISTS test_two_tier CASCADE") + .await + .unwrap(); + server + .execute("DROP TABLE IF EXISTS test_two_tier_fdw CASCADE") + .await + .unwrap(); + + server + .execute( + "CREATE TABLE test_two_tier ( + id BIGINT NOT NULL, + customer_id BIGINT NOT NULL, + created_at DATE NOT NULL + ) PARTITION BY RANGE (created_at)", + ) + .await + .unwrap(); + + server + .execute( + "CREATE TABLE test_two_tier_2024 PARTITION OF test_two_tier + FOR VALUES FROM ('2024-01-01') TO ('2025-01-01')", + ) + .await + .unwrap(); + + server + .execute( + "CREATE TABLE test_two_tier_2025 PARTITION OF test_two_tier + FOR VALUES FROM ('2025-01-01') TO ('2026-01-01')", + ) + .await + .unwrap(); + + let schema = ForeignTableSchema::load(&mut server).await.unwrap(); + + let parent_cols = find_table_columns(&schema, "test_two_tier").expect("Parent should exist"); + let child_2024 = find_table_columns(&schema, "test_two_tier_2024") + .expect("Child 2024 should exist") + .clone(); + let child_2025 = find_table_columns(&schema, "test_two_tier_2025") + .expect("Child 2025 should exist") + .clone(); + + let schema_name = 
&parent_cols.first().unwrap().schema_name; + + let sharded_table = ShardedTable { + database: "test".into(), + name: Some("test_two_tier".into()), + schema: Some(schema_name.clone()), + column: "customer_id".into(), + data_type: DataType::Bigint, + ..Default::default() + }; + + let tables: ShardedTables = [sharded_table].as_slice().into(); + let sharding_schema = ShardingSchema { + shards: 2, + tables, + ..Default::default() + }; + + let result = super::ForeignTableBuilder::new(parent_cols, &sharding_schema) + .with_children(vec![child_2024, child_2025]) + .build() + .unwrap(); + + // 7 statements: parent + 2*(intermediate + 2 foreign) + assert_eq!(result.statements.len(), 7); + + // Parent uses original RANGE partitioning + assert!(result.statements[0].contains("PARTITION BY RANGE")); + + // Intermediate partitions use HASH for sharding + assert!(result.statements[1].contains("PARTITION BY HASH")); + + // Execute parent and intermediate statements + let parent_stmt = result.statements[0].replace("test_two_tier", "test_two_tier_fdw"); + server.execute(&parent_stmt).await.unwrap(); + + let intermediate = result.statements[1] + .replace("test_two_tier_2024", "test_two_tier_fdw_2024") + .replace("test_two_tier", "test_two_tier_fdw"); + server.execute(&intermediate).await.unwrap(); + + server + .execute("DROP TABLE IF EXISTS test_two_tier_fdw CASCADE") + .await + .unwrap(); + server + .execute("DROP TABLE IF EXISTS test_two_tier CASCADE") + .await + .unwrap(); +} + +#[tokio::test] +async fn test_load_foreign_table_schema() { + let mut server = test_server().await; + + server + .execute("DROP TABLE IF EXISTS test_fdw_schema") + .await + .unwrap(); + + server + .execute( + "CREATE TABLE test_fdw_schema ( + id BIGINT NOT NULL, + name VARCHAR(100) DEFAULT 'unknown', + score NUMERIC(10, 2), + created_at TIMESTAMP NOT NULL DEFAULT now() + )", + ) + .await + .unwrap(); + + let schema = ForeignTableSchema::load(&mut server).await.unwrap(); + + let test_rows: Vec<_> = schema + .tables() + .values() + .flatten() + .filter(|r| r.table_name == "test_fdw_schema") + .collect(); + + assert_eq!(test_rows.len(), 4); + + let id_col = test_rows.iter().find(|r| r.column_name == "id").unwrap(); + assert!(id_col.is_not_null); + + server + .execute("DROP TABLE IF EXISTS test_fdw_schema") + .await + .unwrap(); +} diff --git a/pgdog/src/frontend/client/query_engine/connect.rs b/pgdog/src/frontend/client/query_engine/connect.rs index 6d35f8ed..f5ac6cee 100644 --- a/pgdog/src/frontend/client/query_engine/connect.rs +++ b/pgdog/src/frontend/client/query_engine/connect.rs @@ -41,8 +41,6 @@ impl QueryEngine { && (context.params.is_postgres_fdw() && !connect_route.is_ddl() || connect_route.use_fdw()); - debug!("using fdw fallback: {}", use_fdw); - let connect_route = if use_fdw { lazy_static! { static ref FDW_ROUTE: Route = Route::fdw_fallback(); @@ -144,22 +142,18 @@ impl QueryEngine { pub(super) fn transaction_route(&mut self, route: &Route) -> Result { let cluster = self.backend.cluster()?; + let mut route = route.clone(); + if cluster.shards().len() == 1 { - Ok( - Route::write(ShardWithPriority::new_override_transaction(Shard::Direct( - 0, - ))) - .with_read(route.is_read()), - ) - } else if route.is_search_path_driven() { + route.set_shard_mut(ShardWithPriority::new_override_transaction(Shard::Direct( + 0, + ))); + } else if !route.is_search_path_driven() { // Schema-based routing will only go to one shard. 
- Ok(route.clone())
- } else {
- Ok(
- Route::write(ShardWithPriority::new_override_transaction(Shard::All))
- .with_read(route.is_read()),
- )
+ route.set_shard_mut(ShardWithPriority::new_override_transaction(Shard::All));
 }
+
+ Ok(route)
 }

 fn debug_connected(&self, context: &QueryEngineContext<'_>, connected: bool) {
diff --git a/pgdog/src/frontend/router/parser/query/mod.rs b/pgdog/src/frontend/router/parser/query/mod.rs
index 1207f22a..662cbb67 100644
--- a/pgdog/src/frontend/router/parser/query/mod.rs
+++ b/pgdog/src/frontend/router/parser/query/mod.rs
@@ -347,7 +347,7 @@ impl QueryParser {
 // e.g. Parse, Describe, Flush-style flow.

 if !context.router_context.executable {
- if let Command::Query(ref query) = command {
+ if let Command::Query(ref mut query) = command {
 if query.is_cross_shard() && statement.rewrite_plan.insert_split.is_empty() {
 context
 .shards_calculator
@@ -355,13 +355,11 @@ impl QueryParser {
 round_robin::next() % context.shards,
 )));

+ query.set_shard_mut(context.shards_calculator.shard().clone());
+
 // Since this query isn't executable and we decided
 // to route it to any shard, we can early return here.
- return Ok(Command::Query(
- query
- .clone()
- .with_shard(context.shards_calculator.shard().clone()),
- ));
+ return Ok(command);
 }
 }
 }
From 72936ef80c7099bf7f0adc6b274867bac05c5c10 Mon Sep 17 00:00:00 2001
From: Lev Kokotov 
Date: Thu, 5 Feb 2026 20:33:04 -0800
Subject: [PATCH 21/29] disable

---
 integration/pgdog.toml | 2 +-
 pgdog-config/src/sharding.rs | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/integration/pgdog.toml b/integration/pgdog.toml
index f090f005..0fcbcc08 100644
--- a/integration/pgdog.toml
+++ b/integration/pgdog.toml
@@ -23,7 +23,7 @@ tls_private_key = "integration/tls/key.pem"
 query_parser_engine = "pg_query_raw"
 system_catalogs = "omnisharded_sticky"
 reload_schema_on_ddl = true
-cross_shard_backend = "fdw"
+cross_shard_backend = "pgdog"

 [memory]
 net_buffer = 8096
diff --git a/pgdog-config/src/sharding.rs b/pgdog-config/src/sharding.rs
index 927536a5..0dfb2324 100644
--- a/pgdog-config/src/sharding.rs
+++ b/pgdog-config/src/sharding.rs
@@ -404,7 +404,7 @@ impl FromStr for LoadSchema {
 #[serde(rename_all = "snake_case", deny_unknown_fields)]
 pub enum CrossShardBackend {
 #[default]
- PgDog,
+ Pgdog,
 Fdw,
 Hybrid,
 }
@@ -418,7 +418,7 @@ impl CrossShardBackend {
 impl Display for CrossShardBackend {
 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 match self {
- Self::PgDog => write!(f, "pgdog"),
+ Self::Pgdog => write!(f, "pgdog"),
 Self::Fdw => write!(f, "fdw"),
 Self::Hybrid => write!(f, "hybrid"),
 }
@@ -430,7 +430,7 @@ impl FromStr for CrossShardBackend {

 fn from_str(s: &str) -> Result<Self, Self::Err> {
 match s {
- "pgdog" => Ok(Self::PgDog),
+ "pgdog" => Ok(Self::Pgdog),
 "fdw" => Ok(Self::Fdw),
 "hybrid" => Ok(Self::Hybrid),
 _ => Err(()),
From f6c1b76d62c9be0d8c0ecd6957cf147f8bc25cd6 Mon Sep 17 00:00:00 2001
From: Lev Kokotov 
Date: Fri, 6 Feb 2026 10:02:53 -0800
Subject: [PATCH 22/29] now we're getting somewhere

---
 pgdog/src/backend/pool/cluster.rs | 4 +
 pgdog/src/backend/pool/connection/buffer.rs | 8 +-
 pgdog/src/backend/pool/connection/mod.rs | 22 +-
 .../frontend/client/query_engine/connect.rs | 21 --
 pgdog/src/frontend/router/parameter_hints.rs | 11 +
 .../router/parser/query/fdw_fallback.rs | 247 +++++++++++++++
 pgdog/src/frontend/router/parser/query/mod.rs | 9 +
 .../frontend/router/parser/query/select.rs | 33 +-
 .../frontend/router/parser/query/test/mod.rs | 1 +
 .../router/parser/query/test/test_ddl.rs | 36 +++
.../parser/query/test/test_fdw_fallback.rs | 287 ++++++++++++++++++
 .../parser/query/test/test_subqueries.rs | 20 +-
 pgdog/src/frontend/router/parser/route.rs | 8 +-
 pgdog/src/frontend/router/parser/statement.rs | 5 +
 14 files changed, 676 insertions(+), 36 deletions(-)
 create mode 100644 pgdog/src/frontend/router/parser/query/fdw_fallback.rs
 create mode 100644 pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs

diff --git a/pgdog/src/backend/pool/cluster.rs b/pgdog/src/backend/pool/cluster.rs
index cab6932f..2c167464 100644
--- a/pgdog/src/backend/pool/cluster.rs
+++ b/pgdog/src/backend/pool/cluster.rs
@@ -494,6 +494,10 @@ impl Cluster {
 self.cross_shard_backend
 }

+ pub fn fdw_fallback_enabled(&self) -> bool {
+ self.cross_shard_backend().need_fdw()
+ }
+
 /// This database/user pair is responsible for schema management.
 pub fn schema_admin(&self) -> bool {
 self.schema_admin
diff --git a/pgdog/src/backend/pool/connection/buffer.rs b/pgdog/src/backend/pool/connection/buffer.rs
index 4b9f028c..c07420c2 100644
--- a/pgdog/src/backend/pool/connection/buffer.rs
+++ b/pgdog/src/backend/pool/connection/buffer.rs
@@ -12,7 +12,7 @@ use crate::{
 },
 net::{
 messages::{DataRow, FromBytes, Message, Protocol, ToBytes, Vector},
- Decoder,
+ BackendKeyData, Decoder,
 },
 };

@@ -213,7 +213,11 @@ impl Buffer {
 /// Take messages from buffer.
 pub(super) fn take(&mut self) -> Option<Message> {
 if self.full {
- self.buffer.pop_front().and_then(|s| s.message().ok())
+ self.buffer.pop_front().and_then(|s| {
+ s.message()
+ .ok()
+ .map(|m| m.backend(BackendKeyData::default()))
+ })
 } else {
 None
 }
diff --git a/pgdog/src/backend/pool/connection/mod.rs b/pgdog/src/backend/pool/connection/mod.rs
index ca8369f5..1297e98d 100644
--- a/pgdog/src/backend/pool/connection/mod.rs
+++ b/pgdog/src/backend/pool/connection/mod.rs
@@ -139,10 +139,26 @@ impl Connection {

 /// Try to get a connection for the given route.
 async fn try_conn(&mut self, request: &Request, route: &Route) -> Result<(), Error> {
- if let Shard::Direct(shard) = route.shard() {
- let mut server = if route.is_fdw_fallback() {
+ if route.is_fdw_fallback() {
+ let server = if route.is_read() {
+ self.cluster()?.replica_fdw(request).await?
+ } else {
 self.cluster()?.primary_fdw(request).await?
- } else if route.is_read() {
+ };
+
+ match &mut self.binding {
+ Binding::Direct(existing) => {
+ let _ = existing.replace(server);
+ }
+
+ Binding::MultiShard(_, _) => {
+ self.binding = Binding::Direct(Some(server));
+ }
+
+ _ => (),
+ };
+ } else if let Shard::Direct(shard) = route.shard() {
+ let mut server = if route.is_read() {
 self.cluster()?.replica(*shard, request).await?
 } else {
 self.cluster()?.primary(*shard, request).await?
diff --git a/pgdog/src/frontend/client/query_engine/connect.rs b/pgdog/src/frontend/client/query_engine/connect.rs
index f5ac6cee..b19cd9dc 100644
--- a/pgdog/src/frontend/client/query_engine/connect.rs
+++ b/pgdog/src/frontend/client/query_engine/connect.rs
@@ -1,4 +1,3 @@
-use lazy_static::lazy_static;
 use tokio::time::timeout;

 use crate::frontend::router::parser::ShardWithPriority;
@@ -31,26 +30,6 @@ impl QueryEngine {

 let connect_route = connect_route.unwrap_or(context.client_request.route());

- // Use fdw backend if:
- //
- // 1. The client asked via SET pgdog.backend and the query is NOT DDL or
- // 2. The query is cross-shard && NOT DDL and
- // 3. 
FDW is enabled - // - let use_fdw = self.backend.cluster()?.cross_shard_backend().need_fdw() - && (context.params.is_postgres_fdw() && !connect_route.is_ddl() - || connect_route.use_fdw()); - - let connect_route = if use_fdw { - lazy_static! { - static ref FDW_ROUTE: Route = Route::fdw_fallback(); - } - - &FDW_ROUTE - } else { - connect_route - }; - let request = Request::new(*context.id, connect_route.is_read()); self.stats.waiting(request.created_at); diff --git a/pgdog/src/frontend/router/parameter_hints.rs b/pgdog/src/frontend/router/parameter_hints.rs index 5585ba11..0cafaef7 100644 --- a/pgdog/src/frontend/router/parameter_hints.rs +++ b/pgdog/src/frontend/router/parameter_hints.rs @@ -16,6 +16,7 @@ pub struct ParameterHints<'a> { pub pgdog_shard: Option<&'a ParameterValue>, pub pgdog_sharding_key: Option<&'a ParameterValue>, pub pgdog_role: Option<&'a ParameterValue>, + pub pgdog_cross_shard_backend: Option<&'a ParameterValue>, hooks: ParserHooks, } @@ -26,6 +27,7 @@ impl<'a> From<&'a Parameters> for ParameterHints<'a> { pgdog_shard: value.get("pgdog.shard"), pgdog_role: value.get("pgdog.role"), pgdog_sharding_key: value.get("pgdog.sharding_key"), + pgdog_cross_shard_backend: value.get("pgdog.cross_shard_backend"), hooks: ParserHooks::default(), } } @@ -112,6 +114,13 @@ impl ParameterHints<'_> { role } + + /// User said fdw + pub(crate) fn use_fdw_fallback(&self) -> bool { + self.pgdog_cross_shard_backend + .map(|s| s.as_str() == Some("fdw")) + .unwrap_or_default() + } } #[cfg(test)] @@ -148,6 +157,7 @@ mod tests { pgdog_shard: None, pgdog_sharding_key: Some(&sharding_key), pgdog_role: None, + pgdog_cross_shard_backend: None, hooks: ParserHooks::default(), }; @@ -169,6 +179,7 @@ mod tests { pgdog_shard: None, pgdog_sharding_key: Some(&sharding_key), pgdog_role: None, + pgdog_cross_shard_backend: None, hooks: ParserHooks::default(), }; diff --git a/pgdog/src/frontend/router/parser/query/fdw_fallback.rs b/pgdog/src/frontend/router/parser/query/fdw_fallback.rs new file mode 100644 index 00000000..85c22f8c --- /dev/null +++ b/pgdog/src/frontend/router/parser/query/fdw_fallback.rs @@ -0,0 +1,247 @@ +//! FDW fallback detection for queries that cannot be executed across shards. +//! +//! Determines when a query should be sent to postgres_fdw instead of being +//! executed directly by pgdog's cross-shard query engine. + +use pg_query::{protobuf::SelectStmt, Node, NodeEnum}; + +use crate::backend::Schema; +use crate::frontend::router::parser::statement::StatementParser; +use crate::frontend::router::parser::Table; +use crate::net::parameter::ParameterValue; + +/// Context for FDW fallback checking that holds schema lookup information. +pub(crate) struct FdwFallbackContext<'a> { + pub db_schema: &'a Schema, + pub user: &'a str, + pub search_path: Option<&'a ParameterValue>, +} + +impl<'a, 'b, 'c> StatementParser<'a, 'b, 'c> { + /// Check if a SELECT statement requires FDW fallback due to CTEs or subqueries + /// that reference unsharded tables without proper sharding keys. + /// + /// A CTE/subquery is considered "safe" if: + /// 1. It only references sharded or omnisharded tables, OR + /// 2. 
It contains a sharding key in its WHERE clause (handled by correlation) + pub(crate) fn needs_fdw_fallback_for_subqueries( + &self, + stmt: &SelectStmt, + ctx: &FdwFallbackContext, + has_sharding_key: bool, + ) -> bool { + // If the main query already has a sharding key, subqueries are considered + // correlated and inherit the sharding context + if has_sharding_key { + return false; + } + + // Check CTEs in WITH clause + if let Some(ref with_clause) = stmt.with_clause { + for cte in &with_clause.ctes { + if let Some(NodeEnum::CommonTableExpr(ref cte_expr)) = cte.node { + if let Some(ref ctequery) = cte_expr.ctequery { + if let Some(NodeEnum::SelectStmt(ref inner_select)) = ctequery.node { + if self.check_select_needs_fallback(inner_select, ctx) { + return true; + } + } + } + } + } + } + + // Check subqueries in FROM clause + for from_node in &stmt.from_clause { + if self.check_node_needs_fallback(from_node, ctx) { + return true; + } + } + + // Check subqueries in WHERE clause + if let Some(ref where_clause) = stmt.where_clause { + if self.check_node_needs_fallback(where_clause, ctx) { + return true; + } + } + + false + } + + /// Recursively check if a SELECT statement needs FDW fallback. + fn check_select_needs_fallback(&self, stmt: &SelectStmt, ctx: &FdwFallbackContext) -> bool { + // Handle UNION/INTERSECT/EXCEPT + if let Some(ref larg) = stmt.larg { + if self.check_select_needs_fallback(larg, ctx) { + return true; + } + } + if let Some(ref rarg) = stmt.rarg { + if self.check_select_needs_fallback(rarg, ctx) { + return true; + } + } + + // Check tables in FROM clause + for from_node in &stmt.from_clause { + if self.check_from_node_has_unsafe_table(from_node, ctx) { + return true; + } + } + + // Recursively check nested CTEs + if let Some(ref with_clause) = stmt.with_clause { + for cte in &with_clause.ctes { + if let Some(NodeEnum::CommonTableExpr(ref cte_expr)) = cte.node { + if let Some(ref ctequery) = cte_expr.ctequery { + if let Some(NodeEnum::SelectStmt(ref inner_select)) = ctequery.node { + if self.check_select_needs_fallback(inner_select, ctx) { + return true; + } + } + } + } + } + } + + // Recursively check subqueries in FROM + for from_node in &stmt.from_clause { + if self.check_node_needs_fallback(from_node, ctx) { + return true; + } + } + + // Check subqueries in WHERE + if let Some(ref where_clause) = stmt.where_clause { + if self.check_node_needs_fallback(where_clause, ctx) { + return true; + } + } + + false + } + + /// Check if a node contains subqueries that need FDW fallback. 
+ fn check_node_needs_fallback(&self, node: &Node, ctx: &FdwFallbackContext) -> bool { + match &node.node { + Some(NodeEnum::RangeSubselect(subselect)) => { + if let Some(ref subquery) = subselect.subquery { + if let Some(NodeEnum::SelectStmt(ref inner_select)) = subquery.node { + return self.check_select_needs_fallback(inner_select, ctx); + } + } + false + } + Some(NodeEnum::SubLink(sublink)) => { + if let Some(ref subselect) = sublink.subselect { + if let Some(NodeEnum::SelectStmt(ref inner_select)) = subselect.node { + return self.check_select_needs_fallback(inner_select, ctx); + } + } + false + } + Some(NodeEnum::JoinExpr(join)) => { + let mut needs_fallback = false; + if let Some(ref larg) = join.larg { + needs_fallback |= self.check_node_needs_fallback(larg, ctx); + } + if let Some(ref rarg) = join.rarg { + needs_fallback |= self.check_node_needs_fallback(rarg, ctx); + } + needs_fallback + } + Some(NodeEnum::BoolExpr(bool_expr)) => { + for arg in &bool_expr.args { + if self.check_node_needs_fallback(arg, ctx) { + return true; + } + } + false + } + Some(NodeEnum::AExpr(a_expr)) => { + if let Some(ref lexpr) = a_expr.lexpr { + if self.check_node_needs_fallback(lexpr, ctx) { + return true; + } + } + if let Some(ref rexpr) = a_expr.rexpr { + if self.check_node_needs_fallback(rexpr, ctx) { + return true; + } + } + false + } + _ => false, + } + } + + /// Check if a FROM clause node references an unsafe (unsharded) table. + fn check_from_node_has_unsafe_table(&self, node: &Node, ctx: &FdwFallbackContext) -> bool { + match &node.node { + Some(NodeEnum::RangeVar(range_var)) => { + let table = Table::from(range_var); + !self.is_table_safe(&table, ctx) + } + Some(NodeEnum::JoinExpr(join)) => { + let mut has_unsafe = false; + if let Some(ref larg) = join.larg { + has_unsafe |= self.check_from_node_has_unsafe_table(larg, ctx); + } + if let Some(ref rarg) = join.rarg { + has_unsafe |= self.check_from_node_has_unsafe_table(rarg, ctx); + } + has_unsafe + } + Some(NodeEnum::RangeSubselect(_)) => { + // Subselects are checked separately for their contents + false + } + _ => false, + } + } + + /// Check if a table is "safe" (sharded or omnisharded). 
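Before the implementation below, the rule in rough standalone form: a table is safe when every shard can answer for it, either because it is sharded (each shard holds a slice) or omnisharded (each shard holds a full copy). The `ShardedConfig` type and lookup sets here are hypothetical stand-ins for the sharded-tables config and omnishard map, not pgdog's API:

```rust
use std::collections::HashSet;

/// Hypothetical stand-in for the sharded-tables configuration.
struct ShardedConfig {
    /// (schema, table) pairs that are hash/list/range sharded.
    sharded: HashSet<(String, String)>,
    /// Tables replicated in full on every shard.
    omnisharded: HashSet<String>,
}

impl ShardedConfig {
    /// A table is "safe" for cross-shard subqueries if every shard can
    /// answer for it: sharded tables hold a slice each, omnisharded
    /// tables hold a full copy. Anything else forces the FDW fallback.
    fn is_safe(&self, schema: &str, table: &str) -> bool {
        self.sharded.contains(&(schema.to_string(), table.to_string()))
            || self.omnisharded.contains(table)
    }
}

fn main() {
    let config = ShardedConfig {
        sharded: HashSet::from([("public".to_string(), "sharded".to_string())]),
        omnisharded: HashSet::from(["sharded_omni".to_string()]),
    };

    assert!(config.is_safe("public", "sharded")); // sharded: safe
    assert!(config.is_safe("public", "sharded_omni")); // replicated: safe
    assert!(!config.is_safe("public", "users")); // unsharded: fall back
}
```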
+ fn is_table_safe(&self, table: &Table, ctx: &FdwFallbackContext) -> bool { + let sharded_tables = self.sharding_schema().tables(); + + // Check named sharded table configs + for config in sharded_tables.tables() { + if let Some(ref config_name) = config.name { + if table.name == config_name { + // Also check schema match if specified in config + if let Some(ref config_schema) = config.schema { + let config_schema_str: &str = config_schema.as_str(); + if table.schema != Some(config_schema_str) { + continue; + } + } + return true; + } + } + } + + // Check nameless configs by looking up the table in the db schema + let nameless_configs: Vec<_> = sharded_tables + .tables() + .iter() + .filter(|t| t.name.is_none()) + .collect(); + + if !nameless_configs.is_empty() { + if let Some(relation) = ctx.db_schema.table(*table, ctx.user, ctx.search_path) { + for config in &nameless_configs { + if relation.has_column(&config.column) { + return true; + } + } + } + } + + // Check if it's an omnisharded table + if sharded_tables.omnishards().contains_key(table.name) { + return true; + } + + false + } +} diff --git a/pgdog/src/frontend/router/parser/query/mod.rs b/pgdog/src/frontend/router/parser/query/mod.rs index 662cbb67..4981e0b1 100644 --- a/pgdog/src/frontend/router/parser/query/mod.rs +++ b/pgdog/src/frontend/router/parser/query/mod.rs @@ -24,6 +24,7 @@ use super::{ mod ddl; mod delete; mod explain; +mod fdw_fallback; mod plugins; mod schema_sharding; mod select; @@ -380,6 +381,14 @@ impl QueryParser { if shard.is_direct() { route.set_shard_mut(shard); } + + // User requested fdw backend. Cool, but never for DDL. + if context.router_context.parameter_hints.use_fdw_fallback() + && !route.is_ddl() + && context.router_context.cluster.fdw_fallback_enabled() + { + route.set_fdw_fallback(true); + } } // Set plugin-specified route, if available. diff --git a/pgdog/src/frontend/router/parser/query/select.rs b/pgdog/src/frontend/router/parser/query/select.rs index 56770175..d0665d69 100644 --- a/pgdog/src/frontend/router/parser/query/select.rs +++ b/pgdog/src/frontend/router/parser/query/select.rs @@ -2,6 +2,7 @@ use crate::frontend::router::parser::{ cache::Ast, from_clause::FromClause, where_clause::TablesSource, }; +use super::fdw_fallback::FdwFallbackContext; use super::*; use pgdog_config::system_catalogs; use shared::ConvergeAlgorithm; @@ -41,7 +42,7 @@ impl QueryParser { let mut shards = HashSet::new(); - let (shard, is_sharded, tables) = { + let (shard, is_sharded, tables, needs_fdw_fallback) = { let mut statement_parser = StatementParser::from_select( stmt, context.router_context.bind, @@ -50,9 +51,22 @@ impl QueryParser { ); let shard = statement_parser.shard()?; + let has_sharding_key = shard.is_some(); + + // Check FDW fallback for CTEs/subqueries + let fdw_ctx = FdwFallbackContext { + db_schema: &context.router_context.schema, + user: context.router_context.cluster.user(), + search_path: context.router_context.parameter_hints.search_path, + }; + let needs_fdw = statement_parser.needs_fdw_fallback_for_subqueries( + stmt, + &fdw_ctx, + has_sharding_key, + ); if shard.is_some() { - (shard, true, vec![]) + (shard, true, vec![], needs_fdw) } else { ( None, @@ -62,6 +76,7 @@ impl QueryParser { context.router_context.parameter_hints.search_path, ), statement_parser.extract_tables(), + needs_fdw, ) } }; @@ -199,6 +214,20 @@ impl QueryParser { // Only rewrite if query is cross-shard. 
if query.is_cross_shard() && context.shards > 1 { query.with_aggregate_rewrite_plan_mut(cached_ast.rewrite_plan.aggregates.clone()); + + if context.router_context.cluster.fdw_fallback_enabled() { + // Cross-shard queries with OFFSET > 0 require FDW fallback + // because OFFSET cannot be correctly applied across shards. + if limit.offset.map(|o| o > 0).unwrap_or(false) { + query.set_fdw_fallback(true); + } + + // Cross-shard queries with CTEs or subqueries that reference unsharded + // tables without a sharding key require FDW fallback. + if needs_fdw_fallback { + query.set_fdw_fallback(true); + } + } } Ok(Command::Query(query.with_write(writes))) diff --git a/pgdog/src/frontend/router/parser/query/test/mod.rs b/pgdog/src/frontend/router/parser/query/test/mod.rs index 8f84467e..a090f98e 100644 --- a/pgdog/src/frontend/router/parser/query/test/mod.rs +++ b/pgdog/src/frontend/router/parser/query/test/mod.rs @@ -27,6 +27,7 @@ pub mod test_ddl; pub mod test_delete; pub mod test_dml; pub mod test_explain; +pub mod test_fdw_fallback; pub mod test_functions; pub mod test_insert; pub mod test_rr; diff --git a/pgdog/src/frontend/router/parser/query/test/test_ddl.rs b/pgdog/src/frontend/router/parser/query/test/test_ddl.rs index bf2307d1..c3704b54 100644 --- a/pgdog/src/frontend/router/parser/query/test/test_ddl.rs +++ b/pgdog/src/frontend/router/parser/query/test/test_ddl.rs @@ -14,6 +14,10 @@ fn test_create_table() { assert!(command.route().is_write()); assert_eq!(command.route().shard(), &Shard::All); + assert!( + !command.route().is_fdw_fallback(), + "DDL should not trigger FDW fallback" + ); } #[test] @@ -24,6 +28,10 @@ fn test_drop_table() { assert!(command.route().is_write()); assert_eq!(command.route().shard(), &Shard::All); + assert!( + !command.route().is_fdw_fallback(), + "DDL should not trigger FDW fallback" + ); } #[test] @@ -37,6 +45,10 @@ fn test_alter_table() { assert!(command.route().is_write()); assert_eq!(command.route().shard(), &Shard::All); + assert!( + !command.route().is_fdw_fallback(), + "DDL should not trigger FDW fallback" + ); } #[test] @@ -49,6 +61,10 @@ fn test_create_index() { assert!(command.route().is_write()); assert_eq!(command.route().shard(), &Shard::All); + assert!( + !command.route().is_fdw_fallback(), + "DDL should not trigger FDW fallback" + ); } #[test] @@ -59,6 +75,10 @@ fn test_drop_index() { assert!(command.route().is_write()); assert_eq!(command.route().shard(), &Shard::All); + assert!( + !command.route().is_fdw_fallback(), + "DDL should not trigger FDW fallback" + ); } #[test] @@ -69,6 +89,10 @@ fn test_truncate() { assert!(command.route().is_write()); assert_eq!(command.route().shard(), &Shard::All); + assert!( + !command.route().is_fdw_fallback(), + "DDL should not trigger FDW fallback" + ); } #[test] @@ -79,6 +103,10 @@ fn test_create_sequence() { assert!(command.route().is_write()); assert_eq!(command.route().shard(), &Shard::All); + assert!( + !command.route().is_fdw_fallback(), + "DDL should not trigger FDW fallback" + ); } #[test] @@ -88,6 +116,10 @@ fn test_vacuum() { let command = test.execute(vec![Query::new("VACUUM sharded").into()]); assert!(command.route().is_write()); + assert!( + !command.route().is_fdw_fallback(), + "DDL should not trigger FDW fallback" + ); } #[test] @@ -97,6 +129,10 @@ fn test_analyze() { let command = test.execute(vec![Query::new("ANALYZE sharded").into()]); assert!(command.route().is_write()); + assert!( + !command.route().is_fdw_fallback(), + "DDL should not trigger FDW fallback" + ); } #[test] diff --git 
a/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs b/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs new file mode 100644 index 00000000..161b8dd3 --- /dev/null +++ b/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs @@ -0,0 +1,287 @@ +use crate::net::messages::Query; + +use super::setup::*; + +// ============================================================================= +// OFFSET tests +// ============================================================================= + +/// Cross-shard SELECT with OFFSET > 0 should trigger FDW fallback +/// because OFFSET cannot be correctly applied across shards without +/// fetching all rows first. +#[test] +fn test_cross_shard_offset_triggers_fdw_fallback() { + let mut test = QueryParserTest::new(); + + // This query goes to all shards (no sharding key) with OFFSET + let command = test.execute(vec![Query::new( + "SELECT * FROM sharded ORDER BY id LIMIT 10 OFFSET 5", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Cross-shard query with OFFSET > 0 should trigger FDW fallback" + ); +} + +/// Cross-shard SELECT with OFFSET = 0 should NOT trigger FDW fallback +#[test] +fn test_cross_shard_offset_zero_no_fdw_fallback() { + let mut test = QueryParserTest::new(); + + let command = test.execute(vec![Query::new( + "SELECT * FROM sharded ORDER BY id LIMIT 10 OFFSET 0", + ) + .into()]); + + let route = command.route(); + assert!( + !route.is_fdw_fallback(), + "Cross-shard query with OFFSET = 0 should NOT trigger FDW fallback" + ); +} + +/// Direct-to-shard SELECT with OFFSET should NOT trigger FDW fallback +#[test] +fn test_direct_shard_offset_no_fdw_fallback() { + let mut test = QueryParserTest::new(); + + // This query goes to a specific shard (has sharding key) + let command = test.execute(vec![Query::new( + "SELECT * FROM sharded WHERE id = 1 ORDER BY id LIMIT 10 OFFSET 5", + ) + .into()]); + + let route = command.route(); + assert!( + !route.is_fdw_fallback(), + "Direct-to-shard query with OFFSET should NOT trigger FDW fallback" + ); +} + +// ============================================================================= +// CTE tests +// ============================================================================= + +/// CTE that references an unsharded table without a sharding key should trigger +/// FDW fallback when the main query is cross-shard. 
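The OFFSET rule pinned down by the tests above is easiest to see with concrete numbers: pushing `OFFSET` down to each shard makes every shard skip its own first rows, so globally visible rows disappear. A simulation with made-up data (not pgdog code):

```rust
/// Simulate `SELECT id FROM t ORDER BY id LIMIT 3 OFFSET 2` over two shards.
fn main() {
    let shard0 = vec![1, 3, 5, 7, 9];
    let shard1 = vec![2, 4, 6, 8, 10];

    // Correct: merge all rows first, then apply OFFSET/LIMIT globally.
    let mut merged: Vec<i32> = shard0.iter().chain(shard1.iter()).copied().collect();
    merged.sort();
    let correct: Vec<i32> = merged.iter().skip(2).take(3).copied().collect();
    assert_eq!(correct, vec![3, 4, 5]);

    // Wrong: push OFFSET 2 LIMIT 3 down to each shard, then merge.
    let mut pushed: Vec<i32> = shard0
        .iter()
        .skip(2)
        .take(3)
        .chain(shard1.iter().skip(2).take(3))
        .copied()
        .collect();
    pushed.sort();
    // Rows 3 and 4 are gone: each shard skipped its own first two rows.
    assert_eq!(pushed, vec![5, 6, 7, 8, 9, 10]);
}
```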
+#[test] +fn test_cte_unsharded_table_triggers_fdw_fallback() { + let mut test = QueryParserTest::new(); + + // `users` is not in the sharded tables config, making it "unsharded" + // The CTE has no sharding key, so this should trigger FDW fallback + let command = test.execute(vec![Query::new( + "WITH user_data AS (SELECT * FROM users WHERE email = 'test@test.com') + SELECT s.* FROM sharded s JOIN user_data u ON s.value = u.id", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "CTE with unsharded table and no sharding key should trigger FDW fallback" + ); +} + +/// CTE that only references sharded tables should NOT trigger FDW fallback +#[test] +fn test_cte_sharded_table_no_fdw_fallback() { + let mut test = QueryParserTest::new(); + + // CTE references only the sharded table + let command = test.execute(vec![Query::new( + "WITH shard_data AS (SELECT * FROM sharded WHERE id = 5) + SELECT * FROM shard_data", + ) + .into()]); + + let route = command.route(); + assert!( + !route.is_fdw_fallback(), + "CTE with sharded table and sharding key should NOT trigger FDW fallback" + ); +} + +/// CTE that only references omnisharded tables should NOT trigger FDW fallback +#[test] +fn test_cte_omnisharded_table_no_fdw_fallback() { + let mut test = QueryParserTest::new(); + + // CTE references only omnisharded table + let command = test.execute(vec![Query::new( + "WITH omni_data AS (SELECT * FROM sharded_omni WHERE id = 1) + SELECT * FROM omni_data", + ) + .into()]); + + let route = command.route(); + assert!( + !route.is_fdw_fallback(), + "CTE with omnisharded table should NOT trigger FDW fallback" + ); +} + +// ============================================================================= +// Subquery tests +// ============================================================================= + +/// Subquery in FROM that references unsharded table without sharding key +/// should trigger FDW fallback when main query is cross-shard. +#[test] +fn test_subquery_unsharded_table_triggers_fdw_fallback() { + let mut test = QueryParserTest::new(); + + // Subquery references unsharded table without sharding key + let command = test.execute(vec![Query::new( + "SELECT s.* FROM sharded s + JOIN (SELECT * FROM users WHERE active = true) u ON s.value = u.id", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Subquery with unsharded table should trigger FDW fallback" + ); +} + +/// Subquery with correlated reference to outer sharding key should NOT trigger +/// FDW fallback (inherits sharding context from outer query). +#[test] +fn test_subquery_correlated_no_fdw_fallback() { + let mut test = QueryParserTest::new(); + + // Correlated subquery references outer query's sharded column + let command = test.execute(vec![Query::new( + "SELECT * FROM sharded s WHERE s.id = 5 AND EXISTS ( + SELECT 1 FROM sharded_omni o WHERE o.id = s.id + )", + ) + .into()]); + + let route = command.route(); + assert!( + !route.is_fdw_fallback(), + "Correlated subquery with sharding key in outer query should NOT trigger FDW fallback" + ); +} + +// ============================================================================= +// Edge case tests +// ============================================================================= + +/// Multiple CTEs where one is safe (sharded table) and one is unsafe (unsharded +/// table) should trigger FDW fallback when there's no sharding key. 
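Taken together, the CTE and correlation tests encode a single boolean rule. A toy restatement (illustrative only; the real check walks the AST as shown in fdw_fallback.rs):

```rust
/// Toy restatement: fall back only when the outer statement has no
/// sharding key AND some CTE or subquery touches an unsafe table.
fn needs_fallback(has_sharding_key: bool, subquery_tables_all_safe: bool) -> bool {
    !has_sharding_key && !subquery_tables_all_safe
}

fn main() {
    // Unsharded CTE, no key anywhere: fall back.
    assert!(needs_fallback(false, false));
    // Only sharded/omnisharded tables referenced: pgdog can handle it.
    assert!(!needs_fallback(false, true));
    // Outer sharding key: subqueries count as correlated, no fallback.
    assert!(!needs_fallback(true, false));
    assert!(!needs_fallback(true, true));
}
```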
+#[test] +fn test_multiple_ctes_mixed_safe_unsafe_triggers_fallback() { + let mut test = QueryParserTest::new(); + + // First CTE uses sharded table (safe), second CTE uses unsharded table (unsafe) + // No sharding key in either CTE, so unsafe CTE triggers FDW fallback + let command = test.execute(vec![Query::new( + "WITH safe_data AS (SELECT * FROM sharded), + unsafe_data AS (SELECT * FROM users WHERE active = true) + SELECT s.*, u.* FROM safe_data s, unsafe_data u", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Multiple CTEs with one unsafe table should trigger FDW fallback" + ); +} + +/// Nested subqueries where the innermost references an unsharded table +/// should trigger FDW fallback. +#[test] +fn test_deeply_nested_subquery_unsharded_triggers_fallback() { + let mut test = QueryParserTest::new(); + + // Three levels deep: outer query -> subquery -> subquery with unsharded table + let command = test.execute(vec![Query::new( + "SELECT * FROM sharded WHERE value IN ( + SELECT id FROM sharded_omni WHERE id IN ( + SELECT id FROM users WHERE active = true + ) + )", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Deeply nested subquery with unsharded table should trigger FDW fallback" + ); +} + +/// JOIN inside a subquery mixing sharded and unsharded tables should trigger +/// FDW fallback. +#[test] +fn test_subquery_join_mixed_tables_triggers_fallback() { + let mut test = QueryParserTest::new(); + + // Subquery JOINs sharded and unsharded tables + let command = test.execute(vec![Query::new( + "SELECT * FROM sharded WHERE id IN ( + SELECT s.id FROM sharded s + JOIN users u ON s.value = u.id + WHERE u.active = true + )", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Subquery JOIN mixing sharded and unsharded tables should trigger FDW fallback" + ); +} + +/// OFFSET with bind parameter should trigger FDW fallback when value > 0. +#[test] +fn test_offset_bind_parameter_triggers_fallback() { + use crate::net::messages::Parameter; + + let mut test = QueryParserTest::new(); + + // OFFSET using $1 bind parameter with value 5 + let command = test.execute(vec![ + Parse::named( + "__offset_test", + "SELECT * FROM sharded ORDER BY id LIMIT 10 OFFSET $1", + ) + .into(), + Bind::new_params("__offset_test", &[Parameter::new(b"5")]).into(), + Execute::new().into(), + Sync.into(), + ]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Cross-shard query with OFFSET bind parameter > 0 should trigger FDW fallback" + ); +} + +/// Schema-qualified unsharded table should still trigger FDW fallback. 
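The bind-parameter variant above works because the value of `$1` is resolved from the Bind message before the OFFSET rule runs. A minimal sketch of that resolution step, assuming text-format parameters; `resolve_offset` is a hypothetical helper, not pgdog's internals:

```rust
/// Resolve an `OFFSET $n` placeholder against text-format bind parameters.
fn resolve_offset(placeholder: usize, params: &[&[u8]]) -> Option<i64> {
    // $1 is the first parameter; Postgres placeholders are 1-based.
    let raw = params.get(placeholder.checked_sub(1)?)?;
    std::str::from_utf8(raw).ok()?.trim().parse().ok()
}

fn main() {
    // Bind carries b"5" for $1, as in the test above.
    let params: Vec<&[u8]> = vec![b"5"];
    let offset = resolve_offset(1, &params);
    assert_eq!(offset, Some(5));
    // OFFSET > 0 on a cross-shard route => FDW fallback.
    assert!(offset.map(|o| o > 0).unwrap_or(false));
}
```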
+#[test] +fn test_schema_qualified_unsharded_triggers_fallback() { + let mut test = QueryParserTest::new(); + + // Using schema-qualified name for unsharded table + let command = test.execute(vec![Query::new( + "WITH user_data AS (SELECT * FROM public.users WHERE email = 'test@test.com') + SELECT s.* FROM sharded s JOIN user_data u ON s.value = u.id", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Schema-qualified unsharded table should trigger FDW fallback" + ); +} diff --git a/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs b/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs index db9ef150..b2d6b16d 100644 --- a/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs +++ b/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs @@ -7,13 +7,17 @@ use super::setup::{QueryParserTest, *}; fn test_subquery_in_where() { let mut test = QueryParserTest::new(); + // Subquery references `other_table` which is unsharded, so FDW fallback is triggered let command = test.execute(vec![Query::new( "SELECT * FROM sharded WHERE id IN (SELECT id FROM other_table WHERE status = 'active')", ) .into()]); assert!(command.route().is_read()); - assert_eq!(command.route().shard(), &Shard::All); + assert!( + command.route().is_fdw_fallback(), + "Subquery with unsharded table should trigger FDW fallback" + ); } #[test] @@ -57,6 +61,7 @@ fn test_scalar_subquery() { fn test_subquery_with_sharding_key() { let mut test = QueryParserTest::new(); + // Subquery references `other` which is unsharded, so FDW fallback is triggered let command = test.execute(vec![ Parse::named( "__test_sub", @@ -68,21 +73,28 @@ fn test_subquery_with_sharding_key() { Sync.into(), ]); - // Can't route to specific shard because we don't know the subquery result - assert_eq!(command.route().shard(), &Shard::All); + assert!( + command.route().is_fdw_fallback(), + "Subquery with unsharded table should trigger FDW fallback" + ); } #[test] fn test_nested_subqueries() { let mut test = QueryParserTest::new(); + // Nested subqueries reference `other` and `statuses` which are unsharded, + // so FDW fallback is triggered let command = test.execute(vec![Query::new( "SELECT * FROM sharded WHERE id IN (SELECT id FROM other WHERE status IN (SELECT status FROM statuses))", ) .into()]); assert!(command.route().is_read()); - assert_eq!(command.route().shard(), &Shard::All); + assert!( + command.route().is_fdw_fallback(), + "Nested subqueries with unsharded tables should trigger FDW fallback" + ); } #[test] diff --git a/pgdog/src/frontend/router/parser/route.rs b/pgdog/src/frontend/router/parser/route.rs index 8f4cfe73..91827d48 100644 --- a/pgdog/src/frontend/router/parser/route.rs +++ b/pgdog/src/frontend/router/parser/route.rs @@ -163,8 +163,8 @@ impl Route { !self.is_read() } - pub fn is_fdw_fallback(&self) -> bool { - self.fdw_fallback + pub fn set_fdw_fallback(&mut self, fallback: bool) { + self.fdw_fallback = fallback; } /// Get shard if any. 
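The next hunk completes the API pivot of this patch: the removed `use_fdw()` heuristic treated every cross-shard, non-DDL route as an FDW candidate, while the new `fdw_fallback` flag is set explicitly by the query parser. A condensed before/after with simplified, illustrative types:

```rust
/// Simplified stand-in for the router's Route type.
struct Route {
    cross_shard: bool,
    ddl: bool,
    fdw_fallback: bool,
}

impl Route {
    // Before: fallback was inferred from the route shape, so every
    // cross-shard non-DDL statement went through postgres_fdw.
    fn use_fdw_old(&self) -> bool {
        self.cross_shard && !self.ddl
    }

    // After: the parser decides, and records the decision explicitly.
    fn set_fdw_fallback(&mut self, fallback: bool) {
        self.fdw_fallback = fallback;
    }
    fn is_fdw_fallback(&self) -> bool {
        self.fdw_fallback
    }
}

fn main() {
    let mut route = Route { cross_shard: true, ddl: false, fdw_fallback: false };
    assert!(route.use_fdw_old());      // old heuristic: always fall back
    assert!(!route.is_fdw_fallback()); // new flag: off until the parser opts in
    route.set_fdw_fallback(true);
    assert!(route.is_fdw_fallback());
}
```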
@@ -193,8 +193,8 @@ impl Route { self.is_all_shards() || self.is_multi_shard() } - pub fn use_fdw(&self) -> bool { - self.is_cross_shard() && !self.is_ddl() + pub fn is_fdw_fallback(&self) -> bool { + self.fdw_fallback } pub fn order_by(&self) -> &[OrderBy] { diff --git a/pgdog/src/frontend/router/parser/statement.rs b/pgdog/src/frontend/router/parser/statement.rs index 7e962ddf..ba30a53d 100644 --- a/pgdog/src/frontend/router/parser/statement.rs +++ b/pgdog/src/frontend/router/parser/statement.rs @@ -245,6 +245,11 @@ impl<'a, 'b, 'c> StatementParser<'a, 'b, 'c> { self } + /// Get the sharding schema reference. + pub fn sharding_schema(&self) -> &ShardingSchema { + self.schema + } + pub fn from_select( stmt: &'a SelectStmt, bind: Option<&'b Bind>, From aecdb87ef5b71ad8e2566c51052a2a2e1caf366a Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Fri, 6 Feb 2026 10:11:23 -0800 Subject: [PATCH 23/29] fix tests --- pgdog/src/backend/pool/cluster.rs | 4 ++ .../router/parser/query/test/setup.rs | 9 +++- .../parser/query/test/test_fdw_fallback.rs | 50 ++++++++++++++----- .../parser/query/test/test_subqueries.rs | 6 +-- 4 files changed, 51 insertions(+), 18 deletions(-) diff --git a/pgdog/src/backend/pool/cluster.rs b/pgdog/src/backend/pool/cluster.rs index 2c167464..95f8a4fd 100644 --- a/pgdog/src/backend/pool/cluster.rs +++ b/pgdog/src/backend/pool/cluster.rs @@ -805,6 +805,10 @@ mod test { pub fn set_read_write_strategy(&mut self, rw_strategy: ReadWriteStrategy) { self.rw_strategy = rw_strategy; } + + pub fn set_cross_shard_backend(&mut self, backend: pgdog_config::CrossShardBackend) { + self.cross_shard_backend = backend; + } } #[test] diff --git a/pgdog/src/frontend/router/parser/query/test/setup.rs b/pgdog/src/frontend/router/parser/query/test/setup.rs index 36457e86..e15d22a2 100644 --- a/pgdog/src/frontend/router/parser/query/test/setup.rs +++ b/pgdog/src/frontend/router/parser/query/test/setup.rs @@ -1,6 +1,6 @@ use std::ops::Deref; -use pgdog_config::ConfigAndUsers; +use pgdog_config::{ConfigAndUsers, CrossShardBackend}; use crate::{ backend::Cluster, @@ -101,7 +101,12 @@ impl QueryParserTest { self } - /// Startup parameters. + /// Enable FDW fallback (sets cross_shard_backend to Hybrid). + pub(crate) fn with_fdw_fallback(mut self) -> Self { + self.cluster + .set_cross_shard_backend(CrossShardBackend::Hybrid); + self + } /// Execute a request and return the command (panics on error). pub(crate) fn execute(&mut self, request: Vec) -> Command { diff --git a/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs b/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs index 161b8dd3..c65228f6 100644 --- a/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs +++ b/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs @@ -2,6 +2,30 @@ use crate::net::messages::Query; use super::setup::*; +// ============================================================================= +// Config verification tests +// ============================================================================= + +/// FDW fallback should NOT be triggered when cross_shard_backend is not +/// configured for FDW (default is Pgdog). 
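The `with_fdw_fallback()` helper added above exists because the fallback is doubly gated: the cluster's `cross_shard_backend` must permit FDW, and the query (or a `SET pgdog.cross_shard_backend TO 'fdw'` session hint) must ask for it. A sketch of that gate, mirroring the config enum's variants; the `need_fdw()` body is an assumption (Fdw and Hybrid enable it, Pgdog does not), since the patch does not show it:

```rust
#[derive(Default)]
enum CrossShardBackend {
    #[default]
    Pgdog,
    Fdw,
    Hybrid,
}

impl CrossShardBackend {
    // Assumed definition: both fdw-capable modes allow the fallback.
    fn need_fdw(&self) -> bool {
        !matches!(self, Self::Pgdog)
    }
}

/// FDW fallback requires both the cluster config and a reason
/// (session hint or an unroutable query shape) to agree.
fn fdw_fallback(backend: &CrossShardBackend, query_needs_fdw: bool) -> bool {
    backend.need_fdw() && query_needs_fdw
}

fn main() {
    // Default config: never fall back, even for qualifying queries.
    assert!(!fdw_fallback(&CrossShardBackend::Pgdog, true));
    // Fdw or Hybrid (what with_fdw_fallback() sets): fall back when needed.
    assert!(fdw_fallback(&CrossShardBackend::Fdw, true));
    assert!(fdw_fallback(&CrossShardBackend::Hybrid, true));
    assert!(!fdw_fallback(&CrossShardBackend::Hybrid, false));
}
```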
+#[test] +fn test_fdw_fallback_requires_config() { + // Without with_fdw_fallback(), cross_shard_backend defaults to Pgdog + let mut test = QueryParserTest::new(); + + // This query would normally trigger FDW fallback (OFFSET > 0) + let command = test.execute(vec![Query::new( + "SELECT * FROM sharded ORDER BY id LIMIT 10 OFFSET 5", + ) + .into()]); + + let route = command.route(); + assert!( + !route.is_fdw_fallback(), + "FDW fallback should NOT be triggered when cross_shard_backend is Pgdog (default)" + ); +} + // ============================================================================= // OFFSET tests // ============================================================================= @@ -11,7 +35,7 @@ use super::setup::*; /// fetching all rows first. #[test] fn test_cross_shard_offset_triggers_fdw_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // This query goes to all shards (no sharding key) with OFFSET let command = test.execute(vec![Query::new( @@ -29,7 +53,7 @@ fn test_cross_shard_offset_triggers_fdw_fallback() { /// Cross-shard SELECT with OFFSET = 0 should NOT trigger FDW fallback #[test] fn test_cross_shard_offset_zero_no_fdw_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); let command = test.execute(vec![Query::new( "SELECT * FROM sharded ORDER BY id LIMIT 10 OFFSET 0", @@ -46,7 +70,7 @@ fn test_cross_shard_offset_zero_no_fdw_fallback() { /// Direct-to-shard SELECT with OFFSET should NOT trigger FDW fallback #[test] fn test_direct_shard_offset_no_fdw_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // This query goes to a specific shard (has sharding key) let command = test.execute(vec![Query::new( @@ -69,7 +93,7 @@ fn test_direct_shard_offset_no_fdw_fallback() { /// FDW fallback when the main query is cross-shard. #[test] fn test_cte_unsharded_table_triggers_fdw_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // `users` is not in the sharded tables config, making it "unsharded" // The CTE has no sharding key, so this should trigger FDW fallback @@ -89,7 +113,7 @@ fn test_cte_unsharded_table_triggers_fdw_fallback() { /// CTE that only references sharded tables should NOT trigger FDW fallback #[test] fn test_cte_sharded_table_no_fdw_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // CTE references only the sharded table let command = test.execute(vec![Query::new( @@ -108,7 +132,7 @@ fn test_cte_sharded_table_no_fdw_fallback() { /// CTE that only references omnisharded tables should NOT trigger FDW fallback #[test] fn test_cte_omnisharded_table_no_fdw_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // CTE references only omnisharded table let command = test.execute(vec![Query::new( @@ -132,7 +156,7 @@ fn test_cte_omnisharded_table_no_fdw_fallback() { /// should trigger FDW fallback when main query is cross-shard. 
#[test] fn test_subquery_unsharded_table_triggers_fdw_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // Subquery references unsharded table without sharding key let command = test.execute(vec![Query::new( @@ -152,7 +176,7 @@ fn test_subquery_unsharded_table_triggers_fdw_fallback() { /// FDW fallback (inherits sharding context from outer query). #[test] fn test_subquery_correlated_no_fdw_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // Correlated subquery references outer query's sharded column let command = test.execute(vec![Query::new( @@ -177,7 +201,7 @@ fn test_subquery_correlated_no_fdw_fallback() { /// table) should trigger FDW fallback when there's no sharding key. #[test] fn test_multiple_ctes_mixed_safe_unsafe_triggers_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // First CTE uses sharded table (safe), second CTE uses unsharded table (unsafe) // No sharding key in either CTE, so unsafe CTE triggers FDW fallback @@ -199,7 +223,7 @@ fn test_multiple_ctes_mixed_safe_unsafe_triggers_fallback() { /// should trigger FDW fallback. #[test] fn test_deeply_nested_subquery_unsharded_triggers_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // Three levels deep: outer query -> subquery -> subquery with unsharded table let command = test.execute(vec![Query::new( @@ -222,7 +246,7 @@ fn test_deeply_nested_subquery_unsharded_triggers_fallback() { /// FDW fallback. #[test] fn test_subquery_join_mixed_tables_triggers_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // Subquery JOINs sharded and unsharded tables let command = test.execute(vec![Query::new( @@ -246,7 +270,7 @@ fn test_subquery_join_mixed_tables_triggers_fallback() { fn test_offset_bind_parameter_triggers_fallback() { use crate::net::messages::Parameter; - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // OFFSET using $1 bind parameter with value 5 let command = test.execute(vec![ @@ -270,7 +294,7 @@ fn test_offset_bind_parameter_triggers_fallback() { /// Schema-qualified unsharded table should still trigger FDW fallback. 
#[test] fn test_schema_qualified_unsharded_triggers_fallback() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // Using schema-qualified name for unsharded table let command = test.execute(vec![Query::new( diff --git a/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs b/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs index b2d6b16d..3d9fe323 100644 --- a/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs +++ b/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs @@ -5,7 +5,7 @@ use super::setup::{QueryParserTest, *}; #[test] fn test_subquery_in_where() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // Subquery references `other_table` which is unsharded, so FDW fallback is triggered let command = test.execute(vec![Query::new( @@ -59,7 +59,7 @@ fn test_scalar_subquery() { #[test] fn test_subquery_with_sharding_key() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // Subquery references `other` which is unsharded, so FDW fallback is triggered let command = test.execute(vec![ @@ -81,7 +81,7 @@ fn test_subquery_with_sharding_key() { #[test] fn test_nested_subqueries() { - let mut test = QueryParserTest::new(); + let mut test = QueryParserTest::new().with_fdw_fallback(); // Nested subqueries reference `other` and `statuses` which are unsharded, // so FDW fallback is triggered From e034c837a0e5b205ad5fa23928e26024089b5ecd Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Fri, 6 Feb 2026 10:52:14 -0800 Subject: [PATCH 24/29] window functions --- .../router/parser/query/fdw_fallback.rs | 100 +++++++++++++++++- .../parser/query/test/test_fdw_fallback.rs | 80 ++++++++++++++ 2 files changed, 175 insertions(+), 5 deletions(-) diff --git a/pgdog/src/frontend/router/parser/query/fdw_fallback.rs b/pgdog/src/frontend/router/parser/query/fdw_fallback.rs index 85c22f8c..7f1a1216 100644 --- a/pgdog/src/frontend/router/parser/query/fdw_fallback.rs +++ b/pgdog/src/frontend/router/parser/query/fdw_fallback.rs @@ -18,12 +18,12 @@ pub(crate) struct FdwFallbackContext<'a> { } impl<'a, 'b, 'c> StatementParser<'a, 'b, 'c> { - /// Check if a SELECT statement requires FDW fallback due to CTEs or subqueries - /// that reference unsharded tables without proper sharding keys. + /// Check if a SELECT statement requires FDW fallback due to CTEs, subqueries, + /// or window functions that cannot be correctly executed across shards. /// - /// A CTE/subquery is considered "safe" if: - /// 1. It only references sharded or omnisharded tables, OR - /// 2. It contains a sharding key in its WHERE clause (handled by correlation) + /// Returns true if: + /// 1. CTEs/subqueries reference unsharded tables without sharding keys + /// 2. Window functions are present (can't be merged across shards) pub(crate) fn needs_fdw_fallback_for_subqueries( &self, stmt: &SelectStmt, @@ -36,6 +36,11 @@ impl<'a, 'b, 'c> StatementParser<'a, 'b, 'c> { return false; } + // Check for window functions in target list + if self.has_window_functions(stmt) { + return true; + } + // Check CTEs in WITH clause if let Some(ref with_clause) = stmt.with_clause { for cte in &with_clause.ctes { @@ -68,6 +73,86 @@ impl<'a, 'b, 'c> StatementParser<'a, 'b, 'c> { false } + /// Check if a SELECT statement contains window functions. 
+ fn has_window_functions(&self, stmt: &SelectStmt) -> bool { + for target in &stmt.target_list { + if self.node_has_window_function(target) { + return true; + } + } + false + } + + /// Recursively check if a node contains a window function. + fn node_has_window_function(&self, node: &Node) -> bool { + match &node.node { + Some(NodeEnum::ResTarget(res_target)) => { + if let Some(ref val) = res_target.val { + return self.node_has_window_function(val); + } + false + } + Some(NodeEnum::FuncCall(func)) => { + // Window function has an OVER clause + func.over.is_some() + } + Some(NodeEnum::AExpr(a_expr)) => { + if let Some(ref lexpr) = a_expr.lexpr { + if self.node_has_window_function(lexpr) { + return true; + } + } + if let Some(ref rexpr) = a_expr.rexpr { + if self.node_has_window_function(rexpr) { + return true; + } + } + false + } + Some(NodeEnum::TypeCast(type_cast)) => { + if let Some(ref arg) = type_cast.arg { + return self.node_has_window_function(arg); + } + false + } + Some(NodeEnum::CoalesceExpr(coalesce)) => { + for arg in &coalesce.args { + if self.node_has_window_function(arg) { + return true; + } + } + false + } + Some(NodeEnum::CaseExpr(case_expr)) => { + if let Some(ref arg) = case_expr.arg { + if self.node_has_window_function(arg) { + return true; + } + } + if let Some(ref defresult) = case_expr.defresult { + if self.node_has_window_function(defresult) { + return true; + } + } + for when in &case_expr.args { + if self.node_has_window_function(when) { + return true; + } + } + false + } + Some(NodeEnum::CaseWhen(case_when)) => { + if let Some(ref result) = case_when.result { + if self.node_has_window_function(result) { + return true; + } + } + false + } + _ => false, + } + } + /// Recursively check if a SELECT statement needs FDW fallback. fn check_select_needs_fallback(&self, stmt: &SelectStmt, ctx: &FdwFallbackContext) -> bool { // Handle UNION/INTERSECT/EXCEPT @@ -82,6 +167,11 @@ impl<'a, 'b, 'c> StatementParser<'a, 'b, 'c> { } } + // Check for window functions + if self.has_window_functions(stmt) { + return true; + } + // Check tables in FROM clause for from_node in &stmt.from_clause { if self.check_from_node_has_unsafe_table(from_node, ctx) { diff --git a/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs b/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs index c65228f6..f2823698 100644 --- a/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs +++ b/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs @@ -309,3 +309,83 @@ fn test_schema_qualified_unsharded_triggers_fallback() { "Schema-qualified unsharded table should trigger FDW fallback" ); } + +// ============================================================================= +// Window function tests +// ============================================================================= + +/// Cross-shard query with window function should trigger FDW fallback +/// because window functions can't be correctly merged across shards. +#[test] +fn test_window_function_cross_shard_triggers_fallback() { + let mut test = QueryParserTest::new().with_fdw_fallback(); + + // ROW_NUMBER() without sharding key = cross-shard + let command = test.execute(vec![Query::new( + "SELECT id, ROW_NUMBER() OVER (ORDER BY id) FROM sharded", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Cross-shard query with window function should trigger FDW fallback" + ); +} + +/// Direct-to-shard query with window function should NOT trigger FDW fallback. 
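The reason window functions force the fallback: each shard would number its own rows starting from 1, and no post-merge step can recover the global numbering. A simulation with made-up data (not pgdog code):

```rust
/// Simulate `SELECT id, ROW_NUMBER() OVER (ORDER BY id)` over two shards.
fn row_numbers(rows: &[i32]) -> Vec<(i32, usize)> {
    let mut sorted = rows.to_vec();
    sorted.sort();
    sorted.into_iter().zip(1..).collect()
}

fn main() {
    let shard0 = [1, 4, 5];
    let shard1 = [2, 3, 6];

    // Correct: number rows after merging everything on one node.
    let mut all: Vec<i32> = shard0.iter().chain(shard1.iter()).copied().collect();
    all.sort();
    let correct = row_numbers(&all);
    assert_eq!(correct[1], (2, 2)); // id 2 is globally row 2

    // Wrong: number per shard, then merge. Both shards emit a "row 2".
    let mut merged: Vec<(i32, usize)> = row_numbers(&shard0)
        .into_iter()
        .chain(row_numbers(&shard1))
        .collect();
    merged.sort();
    assert_eq!(merged[1], (2, 1)); // id 2 got row number 1 on its shard
    assert_eq!(merged[3], (4, 2)); // ...and id 4 also claims row number 2
}
```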
+#[test] +fn test_window_function_single_shard_no_fallback() { + let mut test = QueryParserTest::new().with_fdw_fallback(); + + // ROW_NUMBER() with sharding key = single shard, no fallback needed + let command = test.execute(vec![Query::new( + "SELECT id, ROW_NUMBER() OVER (ORDER BY id) FROM sharded WHERE id = 1", + ) + .into()]); + + let route = command.route(); + assert!( + !route.is_fdw_fallback(), + "Single-shard query with window function should NOT trigger FDW fallback" + ); +} + +/// Multiple window functions in cross-shard query should trigger FDW fallback. +#[test] +fn test_multiple_window_functions_triggers_fallback() { + let mut test = QueryParserTest::new().with_fdw_fallback(); + + let command = test.execute(vec![Query::new( + "SELECT id, + ROW_NUMBER() OVER (ORDER BY id) as rn, + RANK() OVER (PARTITION BY email ORDER BY id) as rnk + FROM sharded", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Cross-shard query with multiple window functions should trigger FDW fallback" + ); +} + +/// Window function in subquery should trigger FDW fallback for cross-shard. +#[test] +fn test_window_function_in_subquery_triggers_fallback() { + let mut test = QueryParserTest::new().with_fdw_fallback(); + + let command = test.execute(vec![Query::new( + "SELECT * FROM ( + SELECT id, ROW_NUMBER() OVER (ORDER BY id) as rn FROM sharded + ) sub WHERE rn <= 10", + ) + .into()]); + + let route = command.route(); + assert!( + route.is_fdw_fallback(), + "Cross-shard subquery with window function should trigger FDW fallback" + ); +} From cc4e3ba287f1b07eec71e03072d98391ba60d1c5 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Fri, 6 Feb 2026 11:30:07 -0800 Subject: [PATCH 25/29] run as postgres --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index eaa89257..a9bb234b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,6 +36,8 @@ RUN apt update && apt install -y postgresql-${PSQL_VERSION} && \ COPY --from=builder /build/target/release/pgdog /usr/local/bin/pgdog +RUN mkdir -p /pgdog && chown postgres:postgres /pgdog WORKDIR /pgdog +USER postgres STOPSIGNAL SIGINT CMD ["/usr/local/bin/pgdog"] From c969ebf5dcad0d115211a0df3118a2f0e3f0ef38 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Fri, 6 Feb 2026 12:20:14 -0800 Subject: [PATCH 26/29] fix: handle auth passthrough --- pgdog/src/backend/databases.rs | 64 +++++++----------------- pgdog/src/backend/pool/connection/mod.rs | 2 +- pgdog/src/frontend/client/mod.rs | 2 +- 3 files changed, 19 insertions(+), 49 deletions(-) diff --git a/pgdog/src/backend/databases.rs b/pgdog/src/backend/databases.rs index d20d6296..84bb8ce1 100644 --- a/pgdog/src/backend/databases.rs +++ b/pgdog/src/backend/databases.rs @@ -11,7 +11,7 @@ use tracing::{debug, error, info, warn}; use crate::backend::fdw::PostgresLauncher; use crate::backend::replication::ShardedSchemas; -use crate::config::PoolerMode; +use crate::config::{set, PoolerMode}; use crate::frontend::client::query_engine::two_pc::Manager; use crate::frontend::router::parser::Cache; use crate::frontend::router::sharding::mapping::mapping_valid; @@ -159,35 +159,32 @@ pub fn reload() -> Result<(), Error> { } /// Add new user to pool. -pub(crate) fn add(mut user: crate::config::User) { +pub(crate) fn add(user: crate::config::User) -> Result<(), Error> { + use std::ops::Deref; + // One user at a time. 
-    let _lock = lock();
+    let lock = lock();
 
     debug!(
         "adding user \"{}\" for database \"{}\" via auth passthrough",
         user.name, user.database
     );
 
-    let config = config();
-    for existing in &config.users.users {
+    let mut config = config().deref().clone();
+    for existing in &mut config.users.users {
         if existing.name == user.name && existing.database == user.database {
-            let mut existing = existing.clone();
-            existing.password = user.password.clone();
-            user = existing;
-        }
-    }
-    let pool = new_pool(&user, &config.config);
-    if let Some((user, cluster)) = pool {
-        let databases = (*databases()).clone();
-        let (added, databases) = databases.add(user, cluster);
-        if added {
-            // Launch the new pool (idempotent).
-            databases.launch();
-            // Don't use replace_databases because Arc refers to the same DBs,
-            // and we'll shut them down.
-            DATABASES.store(Arc::new(databases));
+            if existing.password().is_empty() {
+                existing.password = user.password.clone();
+            }
         }
     }
+
+    set(config)?;
+    drop(lock);
+
+    reload_from_existing()?;
+
+    Ok(())
 }
 
 /// Database/user pair that identifies a database cluster pool.
@@ -229,15 +226,6 @@ impl ToUser for (&str, Option<&str>) {
     }
 }
 
-// impl ToUser for &pgdog_config::User {
-//     fn to_user(&self) -> User {
-//         User {
-//             user: self.name.clone(),
-//             database: self.database.clone(),
-//         }
-//     }
-// }
-
 /// Databases.
 #[derive(Default, Clone)]
 pub struct Databases {
@@ -248,24 +236,6 @@ pub struct Databases {
 }
 
 impl Databases {
-    /// Add new connection pools to the databases.
-    fn add(mut self, user: User, cluster: Cluster) -> (bool, Databases) {
-        match self.databases.entry(user) {
-            Entry::Vacant(e) => {
-                e.insert(cluster);
-                (true, self)
-            }
-            Entry::Occupied(mut e) => {
-                if e.get().password().is_empty() {
-                    e.insert(cluster);
-                    (true, self)
-                } else {
-                    (false, self)
-                }
-            }
-        }
-    }
-
     /// Check if a cluster exists, quickly.
     pub fn exists(&self, user: impl ToUser) -> bool {
         if let Some(cluster) = self.databases.get(&user.to_user()) {
diff --git a/pgdog/src/backend/pool/connection/mod.rs b/pgdog/src/backend/pool/connection/mod.rs
index 1297e98d..c7699c6f 100644
--- a/pgdog/src/backend/pool/connection/mod.rs
+++ b/pgdog/src/backend/pool/connection/mod.rs
@@ -347,7 +347,7 @@ impl Connection {
         if config().config.general.passthrough_auth() && !databases().exists(user) {
             if let Some(ref passthrough_password) = self.passthrough_password {
                 let new_user = User::new(&self.user, passthrough_password, &self.database);
-                databases::add(new_user);
+                databases::add(new_user)?;
             }
         }
 
diff --git a/pgdog/src/frontend/client/mod.rs b/pgdog/src/frontend/client/mod.rs
index 3a07451b..a0232588 100644
--- a/pgdog/src/frontend/client/mod.rs
+++ b/pgdog/src/frontend/client/mod.rs
@@ -175,7 +175,7 @@ impl Client {
             if !exists {
                 let user = user_from_params(&params, &password).ok();
                 if let Some(user) = user {
-                    databases::add(user);
+                    databases::add(user)?;
                 }
             }
             password.password().map(|p| p.to_owned())

From a6c372e99f0e41c3a6f2a5255d1ac2714adb24e9 Mon Sep 17 00:00:00 2001
From: Lev Kokotov
Date: Fri, 6 Feb 2026 13:39:38 -0800
Subject: [PATCH 27/29] fix: more passthrough auth

---
 pgdog/src/backend/databases.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pgdog/src/backend/databases.rs b/pgdog/src/backend/databases.rs
index 84bb8ce1..1b3c9554 100644
--- a/pgdog/src/backend/databases.rs
+++ b/pgdog/src/backend/databases.rs
@@ -1,6 +1,6 @@
 //! Databases behind pgDog.
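+//!
+//! With auth passthrough, new users are added by updating the config and
+//! rebuilding pools via reload_from_existing(), rather than by patching the
+//! live pool map in place.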
-use std::collections::{hash_map::Entry, HashMap}; +use std::collections::HashMap; use std::sync::Arc; use arc_swap::ArcSwap; @@ -171,14 +171,20 @@ pub(crate) fn add(user: crate::config::User) -> Result<(), Error> { ); let mut config = config().deref().clone(); + let mut found = false; for existing in &mut config.users.users { if existing.name == user.name && existing.database == user.database { + found = true; if existing.password().is_empty() { existing.password = user.password.clone(); } } } + if !found { + config.users.users.push(user); + } + set(config)?; drop(lock); From 89c2d53e28fbc905baf28afecdef2ffde152844d Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Tue, 17 Feb 2026 17:30:58 -0800 Subject: [PATCH 28/29] Make fdw backend explicit only --- pgdog/src/frontend/router/parameter_hints.rs | 18 +++-- pgdog/src/frontend/router/parser/cache/ast.rs | 13 ++- pgdog/src/frontend/router/parser/comment.rs | 81 +++++++++++-------- .../router/parser/query/fdw_fallback.rs | 1 + .../frontend/router/parser/query/select.rs | 44 +++------- .../parser/query/test/test_fdw_fallback.rs | 44 ++++++++++ .../parser/query/test/test_subqueries.rs | 12 +-- pgdog/src/frontend/router/parser/route.rs | 5 ++ 8 files changed, 139 insertions(+), 79 deletions(-) diff --git a/pgdog/src/frontend/router/parameter_hints.rs b/pgdog/src/frontend/router/parameter_hints.rs index 0cafaef7..deae1023 100644 --- a/pgdog/src/frontend/router/parameter_hints.rs +++ b/pgdog/src/frontend/router/parameter_hints.rs @@ -1,4 +1,6 @@ -use pgdog_config::Role; +use std::str::FromStr; + +use pgdog_config::{CrossShardBackend, Role}; use super::parser::Error; use crate::{ @@ -115,11 +117,17 @@ impl ParameterHints<'_> { role } - /// User said fdw + /// User said use fdw. pub(crate) fn use_fdw_fallback(&self) -> bool { - self.pgdog_cross_shard_backend - .map(|s| s.as_str() == Some("fdw")) - .unwrap_or_default() + if let Some(ref val) = self.pgdog_cross_shard_backend { + if let Some(s) = val.as_str() { + if let Ok(fdw) = CrossShardBackend::from_str(s) { + return fdw.need_fdw(); + } + } + } + + false } } diff --git a/pgdog/src/frontend/router/parser/cache/ast.rs b/pgdog/src/frontend/router/parser/cache/ast.rs index 2be0e0fb..ff72bc7c 100644 --- a/pgdog/src/frontend/router/parser/cache/ast.rs +++ b/pgdog/src/frontend/router/parser/cache/ast.rs @@ -1,5 +1,5 @@ use pg_query::{parse, parse_raw, protobuf::ObjectType, NodeEnum, NodeRef, ParseResult}; -use pgdog_config::QueryParserEngine; +use pgdog_config::{CrossShardBackend, QueryParserEngine}; use std::fmt::Debug; use std::time::Instant; use std::{collections::HashSet, ops::Deref}; @@ -12,6 +12,7 @@ use super::super::{ }; use super::{Fingerprint, Stats}; use crate::backend::schema::Schema; +use crate::frontend::router::parser::comment::CommentRoute; use crate::frontend::router::parser::rewrite::statement::RewritePlan; use crate::frontend::{BufferedQuery, PreparedStatements}; use crate::net::parameter::ParameterValue; @@ -41,6 +42,8 @@ pub struct AstInner { pub rewrite_plan: RewritePlan, /// Fingerprint. pub fingerprint: Fingerprint, + /// Cross-shard backend. 
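+    /// Parsed from a `/* pgdog_cross_shard_backend: fdw */` comment and
+    /// cached with the AST, so routing can honor the hint without
+    /// re-scanning the query text.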
+    pub cross_shard_backend: Option<CrossShardBackend>,
 }
 
 impl AstInner {
@@ -53,6 +56,7 @@ impl AstInner {
             comment_shard: None,
             rewrite_plan: RewritePlan::default(),
             fingerprint: Fingerprint::default(),
+            cross_shard_backend: None,
         }
     }
 }
@@ -81,7 +85,11 @@ impl Ast {
             QueryParserEngine::PgQueryRaw => parse_raw(query),
         }
         .map_err(Error::PgQuery)?;
-        let (comment_shard, comment_role) = comment(query, schema)?;
+        let CommentRoute {
+            shard: comment_shard,
+            role: comment_role,
+            cross_shard_backend,
+        } = comment(query, schema)?;
 
         let fingerprint =
             Fingerprint::new(query, schema.query_parser_engine).map_err(Error::PgQuery)?;
@@ -116,6 +124,7 @@ impl Ast {
                 ast,
                 rewrite_plan,
                 fingerprint,
+                cross_shard_backend,
             }),
         })
     }
diff --git a/pgdog/src/frontend/router/parser/comment.rs b/pgdog/src/frontend/router/parser/comment.rs
index a87883ad..15d49a2f 100644
--- a/pgdog/src/frontend/router/parser/comment.rs
+++ b/pgdog/src/frontend/router/parser/comment.rs
@@ -1,7 +1,7 @@
 use once_cell::sync::Lazy;
 use pg_query::scan_raw;
 use pg_query::{protobuf::Token, scan};
-use pgdog_config::QueryParserEngine;
+use pgdog_config::{CrossShardBackend, QueryParserEngine};
 use regex::Regex;
 
 use crate::backend::ShardingSchema;
@@ -16,6 +16,8 @@ static SHARDING_KEY: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r#"pgdog_sharding_key: *(?:"([^"]*)"|'([^']*)'|([0-9a-zA-Z-]+))"#).unwrap()
 });
 static ROLE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"pgdog_role: *(primary|replica)"#).unwrap());
+static BACKEND: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"pgdog_cross_shard_backend: fdw"#).unwrap());
 
 fn get_matched_value<'a>(caps: &'a regex::Captures<'a>) -> Option<&'a str> {
     caps.get(1)
@@ -24,6 +26,13 @@ fn get_matched_value<'a>(caps: &'a regex::Captures<'a>) -> Option<&'a str> {
         .map(|m| m.as_str())
 }
 
+#[derive(Debug, Clone, PartialEq, Default)]
+pub struct CommentRoute {
+    pub shard: Option<Shard>,
+    pub role: Option<Role>,
+    pub cross_shard_backend: Option<CrossShardBackend>,
+}
+
 /// Extract shard number from a comment.
 ///
 /// Comment style uses the C-style comments (not SQL comments!)
 ///
 /// See [`SHARD`] and [`SHARDING_KEY`] for the style of comment we expect.
 ///
-pub fn comment(
-    query: &str,
-    schema: &ShardingSchema,
-) -> Result<(Option<Shard>, Option<Role>), Error> {
+pub fn comment(query: &str, schema: &ShardingSchema) -> Result<CommentRoute, Error> {
     let tokens = match schema.query_parser_engine {
         QueryParserEngine::PgQueryProtobuf => scan(query),
         QueryParserEngine::PgQueryRaw => scan_raw(query),
     }
     .map_err(Error::PgQuery)?;
-    let mut role = None;
+    let mut comment_route = CommentRoute::default();
 
     for token in tokens.tokens.iter() {
         if token.token == Token::CComment as i32 {
 
             if let Some(cap) = ROLE.captures(comment) {
                 if let Some(r) = cap.get(1) {
                     match r.as_str() {
-                        "primary" => role = Some(Role::Primary),
-                        "replica" => role = Some(Role::Replica),
+                        "primary" => comment_route.role = Some(Role::Primary),
+                        "replica" => comment_route.role = Some(Role::Replica),
                         _ => return Err(Error::RegexError),
                     }
                 }
             }
             if let Some(cap) = SHARDING_KEY.captures(comment) {
                 if let Some(sharding_key) = get_matched_value(&cap) {
                     if let Some(schema) = schema.schemas.get(Some(sharding_key.into())) {
-                        return Ok((Some(schema.shard().into()), role));
+                        comment_route.shard = Some(schema.shard().into());
+                    } else {
+                        let ctx = ContextBuilder::infer_from_from_and_config(sharding_key, schema)?
+                            .shards(schema.shards)
+                            .build()?;
+                        comment_route.shard = Some(ctx.apply()?);
                     }
-                    let ctx = ContextBuilder::infer_from_from_and_config(sharding_key, schema)?
-                        .shards(schema.shards)
-                        .build()?;
-                    return Ok((Some(ctx.apply()?), role));
                 }
-            }
-            if let Some(cap) = SHARD.captures(comment) {
+            } else if let Some(cap) = SHARD.captures(comment) {
                 if let Some(shard) = cap.get(1) {
-                    return Ok((
-                        Some(
-                            shard
-                                .as_str()
-                                .parse::<usize>()
-                                .ok()
-                                .map(Shard::Direct)
-                                .unwrap_or(Shard::All),
-                        ),
-                        role,
-                    ));
+                    comment_route.shard = Some(
+                        shard
+                            .as_str()
+                            .parse::<usize>()
+                            .ok()
+                            .map(Shard::Direct)
+                            .unwrap_or(Shard::All),
+                    );
                 }
             }
+            if let Some(_) = BACKEND.captures(comment) {
+                comment_route.cross_shard_backend = Some(CrossShardBackend::Fdw);
+            }
         }
     }
 
-    Ok((None, role))
+    Ok(comment_route)
 }
 
 #[cfg(test)]
@@ -167,7 +173,7 @@ mod tests {
         let query = "SELECT * FROM users /* pgdog_role: primary */";
         let result = comment(query, &schema).unwrap();
 
-        assert_eq!(result.1, Some(Role::Primary));
+        assert_eq!(result.role, Some(Role::Primary));
     }
 
     #[test]
@@ -182,8 +188,8 @@ mod tests {
         let query = "SELECT * FROM users /* pgdog_role: replica pgdog_shard: 2 */";
         let result = comment(query, &schema).unwrap();
 
-        assert_eq!(result.0, Some(Shard::Direct(2)));
-        assert_eq!(result.1, Some(Role::Replica));
+        assert_eq!(result.shard, Some(Shard::Direct(2)));
+        assert_eq!(result.role, Some(Role::Replica));
     }
 
     #[test]
@@ -198,7 +204,7 @@ mod tests {
         let query = "SELECT * FROM users /* pgdog_role: replica */";
         let result = comment(query, &schema).unwrap();
 
-        assert_eq!(result.1, Some(Role::Replica));
+        assert_eq!(result.role, Some(Role::Replica));
     }
 
     #[test]
@@ -213,7 +219,7 @@ mod tests {
         let query = "SELECT * FROM users /* pgdog_role: invalid */";
         let result = comment(query, &schema).unwrap();
 
-        assert_eq!(result.1, None);
+        assert_eq!(result.role, None);
     }
 
     #[test]
@@ -228,7 +234,7 @@ mod tests {
         let query = "SELECT * FROM users";
         let result = comment(query, &schema).unwrap();
 
-        assert_eq!(result.1, None);
+        assert_eq!(result.role, None);
+    }
+
+    #[test]
+    fn test_fdw_fallback() {
+        let query = "/* pgdog_cross_shard_backend: fdw */ SELECT * FROM users";
+        let result = comment(query, &ShardingSchema::default()).unwrap();
+        assert_eq!(result.cross_shard_backend, Some(CrossShardBackend::Fdw));
     }
 
     #[test]
@@ -253,6 +266,6 @@ mod tests {
         let query = "SELECT * FROM users /* pgdog_sharding_key: sales */";
         let result = comment(query, &schema).unwrap();
 
-        assert_eq!(result.0, Some(Shard::Direct(1)));
+        assert_eq!(result.shard, Some(Shard::Direct(1)));
     }
 }
diff --git a/pgdog/src/frontend/router/parser/query/fdw_fallback.rs b/pgdog/src/frontend/router/parser/query/fdw_fallback.rs
index 7f1a1216..bdd85f93 100644
--- a/pgdog/src/frontend/router/parser/query/fdw_fallback.rs
+++ b/pgdog/src/frontend/router/parser/query/fdw_fallback.rs
@@ -2,6 +2,7 @@
 //!
 //! Determines when a query should be sent to postgres_fdw instead of being
 //! executed directly by pgdog's cross-shard query engine.
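+//!
+//! With the FDW backend now explicit-only, these heuristics are no longer
+//! called from routing, hence the dead_code allowance below. A minimal
+//! sketch of the shape of the decision they encode (illustrative names
+//! only; the real entry points are methods on StatementParser):
+//!
+//! ```ignore
+//! fn needs_fdw(cross_shard: bool, has_window_fns: bool, offset: Option<i64>) -> bool {
+//!     if !cross_shard {
+//!         // Direct-to-shard queries never need the fallback.
+//!         return false;
+//!     }
+//!     // Window functions and non-zero OFFSETs can't be merged across shards.
+//!     has_window_fns || offset.map(|o| o > 0).unwrap_or(false)
+//! }
+//! ```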
+#![allow(dead_code)] use pg_query::{protobuf::SelectStmt, Node, NodeEnum}; diff --git a/pgdog/src/frontend/router/parser/query/select.rs b/pgdog/src/frontend/router/parser/query/select.rs index d0665d69..d76eb1c2 100644 --- a/pgdog/src/frontend/router/parser/query/select.rs +++ b/pgdog/src/frontend/router/parser/query/select.rs @@ -2,7 +2,6 @@ use crate::frontend::router::parser::{ cache::Ast, from_clause::FromClause, where_clause::TablesSource, }; -use super::fdw_fallback::FdwFallbackContext; use super::*; use pgdog_config::system_catalogs; use shared::ConvergeAlgorithm; @@ -33,16 +32,23 @@ impl QueryParser { writes.writes = true; } + let fdw_fallback = cached_ast + .cross_shard_backend + .map(|backend| backend.need_fdw()) + .unwrap_or_default(); + // Early return for any direct-to-shard queries. if context.shards_calculator.shard().is_direct() { return Ok(Command::Query( - Route::read(context.shards_calculator.shard().clone()).with_write(writes), + Route::read(context.shards_calculator.shard().clone()) + .with_write(writes) + .with_fdw_fallback(fdw_fallback), )); } let mut shards = HashSet::new(); - let (shard, is_sharded, tables, needs_fdw_fallback) = { + let (shard, is_sharded, tables) = { let mut statement_parser = StatementParser::from_select( stmt, context.router_context.bind, @@ -51,22 +57,9 @@ impl QueryParser { ); let shard = statement_parser.shard()?; - let has_sharding_key = shard.is_some(); - - // Check FDW fallback for CTEs/subqueries - let fdw_ctx = FdwFallbackContext { - db_schema: &context.router_context.schema, - user: context.router_context.cluster.user(), - search_path: context.router_context.parameter_hints.search_path, - }; - let needs_fdw = statement_parser.needs_fdw_fallback_for_subqueries( - stmt, - &fdw_ctx, - has_sharding_key, - ); if shard.is_some() { - (shard, true, vec![], needs_fdw) + (shard, true, vec![]) } else { ( None, @@ -76,7 +69,6 @@ impl QueryParser { context.router_context.parameter_hints.search_path, ), statement_parser.extract_tables(), - needs_fdw, ) } }; @@ -214,22 +206,10 @@ impl QueryParser { // Only rewrite if query is cross-shard. if query.is_cross_shard() && context.shards > 1 { query.with_aggregate_rewrite_plan_mut(cached_ast.rewrite_plan.aggregates.clone()); - - if context.router_context.cluster.fdw_fallback_enabled() { - // Cross-shard queries with OFFSET > 0 require FDW fallback - // because OFFSET cannot be correctly applied across shards. - if limit.offset.map(|o| o > 0).unwrap_or(false) { - query.set_fdw_fallback(true); - } - - // Cross-shard queries with CTEs or subqueries that reference unsharded - // tables without a sharding key require FDW fallback. 
- if needs_fdw_fallback { - query.set_fdw_fallback(true); - } - } } + query.set_fdw_fallback(fdw_fallback); + Ok(Command::Query(query.with_write(writes))) } diff --git a/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs b/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs index f2823698..b7f3c362 100644 --- a/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs +++ b/pgdog/src/frontend/router/parser/query/test/test_fdw_fallback.rs @@ -2,6 +2,32 @@ use crate::net::messages::Query; use super::setup::*; +#[test] +fn test_fdw_fallback_comment() { + let mut test = QueryParserTest::new().with_fdw_fallback(); + + let command = test.execute(vec![Query::new( + "/* pgdog_cross_shard_backend: fdw */ SELECT * FROM sharded ORDER BY id LIMIT 10 OFFSET 5", + ) + .into()]); + + let route = command.route(); + assert!(route.is_fdw_fallback(),); +} + +#[test] +fn test_fdw_fallback_comment_if_direct() { + let mut test = QueryParserTest::new().with_fdw_fallback(); + + let command = test.execute(vec![Query::new( + "/* pgdog_cross_shard_backend: fdw */ SELECT * FROM sharded WHERE id = 1", + ) + .into()]); + + let route = command.route(); + assert!(route.is_fdw_fallback(),); +} + // ============================================================================= // Config verification tests // ============================================================================= @@ -9,6 +35,7 @@ use super::setup::*; /// FDW fallback should NOT be triggered when cross_shard_backend is not /// configured for FDW (default is Pgdog). #[test] +#[ignore] fn test_fdw_fallback_requires_config() { // Without with_fdw_fallback(), cross_shard_backend defaults to Pgdog let mut test = QueryParserTest::new(); @@ -34,6 +61,7 @@ fn test_fdw_fallback_requires_config() { /// because OFFSET cannot be correctly applied across shards without /// fetching all rows first. #[test] +#[ignore] fn test_cross_shard_offset_triggers_fdw_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -52,6 +80,7 @@ fn test_cross_shard_offset_triggers_fdw_fallback() { /// Cross-shard SELECT with OFFSET = 0 should NOT trigger FDW fallback #[test] +#[ignore] fn test_cross_shard_offset_zero_no_fdw_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -69,6 +98,7 @@ fn test_cross_shard_offset_zero_no_fdw_fallback() { /// Direct-to-shard SELECT with OFFSET should NOT trigger FDW fallback #[test] +#[ignore] fn test_direct_shard_offset_no_fdw_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -92,6 +122,7 @@ fn test_direct_shard_offset_no_fdw_fallback() { /// CTE that references an unsharded table without a sharding key should trigger /// FDW fallback when the main query is cross-shard. 
#[test] +#[ignore] fn test_cte_unsharded_table_triggers_fdw_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -112,6 +143,7 @@ fn test_cte_unsharded_table_triggers_fdw_fallback() { /// CTE that only references sharded tables should NOT trigger FDW fallback #[test] +#[ignore] fn test_cte_sharded_table_no_fdw_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -131,6 +163,7 @@ fn test_cte_sharded_table_no_fdw_fallback() { /// CTE that only references omnisharded tables should NOT trigger FDW fallback #[test] +#[ignore] fn test_cte_omnisharded_table_no_fdw_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -155,6 +188,7 @@ fn test_cte_omnisharded_table_no_fdw_fallback() { /// Subquery in FROM that references unsharded table without sharding key /// should trigger FDW fallback when main query is cross-shard. #[test] +#[ignore] fn test_subquery_unsharded_table_triggers_fdw_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -175,6 +209,7 @@ fn test_subquery_unsharded_table_triggers_fdw_fallback() { /// Subquery with correlated reference to outer sharding key should NOT trigger /// FDW fallback (inherits sharding context from outer query). #[test] +#[ignore] fn test_subquery_correlated_no_fdw_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -200,6 +235,7 @@ fn test_subquery_correlated_no_fdw_fallback() { /// Multiple CTEs where one is safe (sharded table) and one is unsafe (unsharded /// table) should trigger FDW fallback when there's no sharding key. #[test] +#[ignore] fn test_multiple_ctes_mixed_safe_unsafe_triggers_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -222,6 +258,7 @@ fn test_multiple_ctes_mixed_safe_unsafe_triggers_fallback() { /// Nested subqueries where the innermost references an unsharded table /// should trigger FDW fallback. #[test] +#[ignore] fn test_deeply_nested_subquery_unsharded_triggers_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -245,6 +282,7 @@ fn test_deeply_nested_subquery_unsharded_triggers_fallback() { /// JOIN inside a subquery mixing sharded and unsharded tables should trigger /// FDW fallback. #[test] +#[ignore] fn test_subquery_join_mixed_tables_triggers_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -267,6 +305,7 @@ fn test_subquery_join_mixed_tables_triggers_fallback() { /// OFFSET with bind parameter should trigger FDW fallback when value > 0. #[test] +#[ignore] fn test_offset_bind_parameter_triggers_fallback() { use crate::net::messages::Parameter; @@ -293,6 +332,7 @@ fn test_offset_bind_parameter_triggers_fallback() { /// Schema-qualified unsharded table should still trigger FDW fallback. #[test] +#[ignore] fn test_schema_qualified_unsharded_triggers_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -317,6 +357,7 @@ fn test_schema_qualified_unsharded_triggers_fallback() { /// Cross-shard query with window function should trigger FDW fallback /// because window functions can't be correctly merged across shards. #[test] +#[ignore] fn test_window_function_cross_shard_triggers_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -335,6 +376,7 @@ fn test_window_function_cross_shard_triggers_fallback() { /// Direct-to-shard query with window function should NOT trigger FDW fallback. 
#[test] +#[ignore] fn test_window_function_single_shard_no_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -353,6 +395,7 @@ fn test_window_function_single_shard_no_fallback() { /// Multiple window functions in cross-shard query should trigger FDW fallback. #[test] +#[ignore] fn test_multiple_window_functions_triggers_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); @@ -373,6 +416,7 @@ fn test_multiple_window_functions_triggers_fallback() { /// Window function in subquery should trigger FDW fallback for cross-shard. #[test] +#[ignore] fn test_window_function_in_subquery_triggers_fallback() { let mut test = QueryParserTest::new().with_fdw_fallback(); diff --git a/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs b/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs index 3d9fe323..bec0e586 100644 --- a/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs +++ b/pgdog/src/frontend/router/parser/query/test/test_subqueries.rs @@ -15,8 +15,8 @@ fn test_subquery_in_where() { assert!(command.route().is_read()); assert!( - command.route().is_fdw_fallback(), - "Subquery with unsharded table should trigger FDW fallback" + !command.route().is_fdw_fallback(), + "Subquery with unsharded table should not trigger FDW fallback" ); } @@ -74,8 +74,8 @@ fn test_subquery_with_sharding_key() { ]); assert!( - command.route().is_fdw_fallback(), - "Subquery with unsharded table should trigger FDW fallback" + !command.route().is_fdw_fallback(), + "Subquery with unsharded table should not trigger FDW fallback" ); } @@ -92,8 +92,8 @@ fn test_nested_subqueries() { assert!(command.route().is_read()); assert!( - command.route().is_fdw_fallback(), - "Nested subqueries with unsharded tables should trigger FDW fallback" + !command.route().is_fdw_fallback(), + "Nested subqueries with unsharded tables should not trigger FDW fallback" ); } diff --git a/pgdog/src/frontend/router/parser/route.rs b/pgdog/src/frontend/router/parser/route.rs index 91827d48..d3798c69 100644 --- a/pgdog/src/frontend/router/parser/route.rs +++ b/pgdog/src/frontend/router/parser/route.rs @@ -218,6 +218,11 @@ impl Route { self } + pub fn with_fdw_fallback(mut self, fdw_fallback: bool) -> Self { + self.set_fdw_fallback(fdw_fallback); + self + } + pub fn set_schema_changed(&mut self, changed: bool) { self.schema_changed = changed; } From 85da50f8cb822aec871ff76bd9cc291244aed9c6 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Tue, 17 Feb 2026 17:32:53 -0800 Subject: [PATCH 29/29] revert --- integration/pgdog.toml | 3 +-- pgdog/src/net/parameter.rs | 6 ------ 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/integration/pgdog.toml b/integration/pgdog.toml index 0fcbcc08..041d6bb2 100644 --- a/integration/pgdog.toml +++ b/integration/pgdog.toml @@ -22,8 +22,7 @@ tls_certificate = "integration/tls/cert.pem" tls_private_key = "integration/tls/key.pem" query_parser_engine = "pg_query_raw" system_catalogs = "omnisharded_sticky" -reload_schema_on_ddl = true -cross_shard_backend = "pgdog" +reload_schema_on_ddl = false [memory] net_buffer = 8096 diff --git a/pgdog/src/net/parameter.rs b/pgdog/src/net/parameter.rs index 001bb6f2..a9db66c6 100644 --- a/pgdog/src/net/parameter.rs +++ b/pgdog/src/net/parameter.rs @@ -405,12 +405,6 @@ impl Parameters { pub fn search_path(&self) -> Option<&ParameterValue> { self.get("search_path") } - - pub fn is_postgres_fdw(&self) -> bool { - self.get("pgdog.backend") - .map(|p| p.as_str() == Some("fdw")) - .unwrap_or_default() - } } impl 
Deref for Parameters {
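
After patch 28, the FDW backend is never chosen implicitly: a query opts in
either through the C-style comment hint parsed by comment() above, or through
the pgdog_cross_shard_backend session parameter read by
ParameterHints::use_fdw_fallback(). A minimal sketch of the parameter path,
assuming only what the parameter_hints diff shows (CrossShardBackend
implements FromStr and exposes need_fdw()); the wants_fdw helper itself is
illustrative, not part of the codebase:

    use std::str::FromStr;

    use pgdog_config::CrossShardBackend;

    // Hypothetical helper mirroring ParameterHints::use_fdw_fallback():
    // only an explicit "fdw" value opts the session into the FDW backend.
    fn wants_fdw(param: Option<&str>) -> bool {
        param
            .and_then(|s| CrossShardBackend::from_str(s).ok())
            .map(|backend| backend.need_fdw())
            .unwrap_or(false)
    }

    fn main() {
        assert!(wants_fdw(Some("fdw")));
        // An unset parameter keeps the default cross-shard engine.
        assert!(!wants_fdw(None));
    }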