From c120c8695acd62db9ae096ff3515cfe1d68e888b Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Fri, 12 Dec 2025 11:03:23 +0100 Subject: [PATCH 01/13] Added parquet encoding to Vector AWS S3 Output --- Cargo.lock | 56 ++ Cargo.toml | 1 + lib/codecs/Cargo.toml | 2 + lib/codecs/src/encoding/format/arrow.rs | 4 +- lib/codecs/src/encoding/format/mod.rs | 4 + lib/codecs/src/encoding/format/parquet.rs | 656 ++++++++++++++++++ lib/codecs/src/encoding/mod.rs | 4 +- lib/codecs/src/encoding/serializer.rs | 98 ++- lib/vector-lib/Cargo.toml | 1 + src/codecs/encoding/config.rs | 23 +- src/codecs/encoding/encoder.rs | 30 +- src/sinks/aws_s3/config.rs | 11 +- src/sinks/aws_s3/sink.rs | 8 +- src/sinks/util/encoding.rs | 6 +- website/cue/reference/components/sinks.cue | 63 ++ .../cue/reference/components/sinks/aws_s3.cue | 56 +- .../components/sinks/generated/aws_s3.cue | 59 ++ website/cue/reference/urls.cue | 1 + 18 files changed, 1026 insertions(+), 57 deletions(-) create mode 100644 lib/codecs/src/encoding/format/parquet.rs diff --git a/Cargo.lock b/Cargo.lock index dce114e7f9db1..9a22d18134240 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2607,6 +2607,7 @@ dependencies = [ "memchr", "opentelemetry-proto", "ordered-float 4.6.0", + "parquet", "prost 0.12.6", "prost-reflect", "rand 0.9.2", @@ -5753,6 +5754,12 @@ dependencies = [ "web-sys", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "inventory" version = "0.3.21" @@ -7890,6 +7897,38 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "parquet" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +dependencies = [ + "ahash 0.8.11", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64 0.22.1", + "brotli", + "bytes 1.10.1", + "chrono", + "flate2", + "half", + "hashbrown 0.16.0", + "lz4_flex", + "num", + "num-bigint", + "paste", + "seq-macro", + "snap", + "thrift", + "twox-hash", + "zstd 0.13.2", +] + [[package]] name = "parse-size" version = "1.1.0" @@ -10080,6 +10119,12 @@ dependencies = [ "serde", ] +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -11276,6 +11321,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float 2.10.1", +] + [[package]] name = "tikv-jemalloc-sys" version = "0.6.0+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" diff --git a/Cargo.toml b/Cargo.toml index fa99780782eb1..aa3abbe4318fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -584,6 +584,7 @@ enrichment-tables-memory = ["dep:evmap", "dep:evmap-derive", "dep:thread_local"] # Codecs codecs-arrow = ["vector-lib/arrow"] +codecs-parquet = ["vector-lib/parquet"] codecs-opentelemetry = ["vector-lib/opentelemetry"] codecs-syslog = ["vector-lib/syslog"] diff --git a/lib/codecs/Cargo.toml b/lib/codecs/Cargo.toml index 2cb4ae3bbdb35..b35b1c73a87bb 100644 --- 
a/lib/codecs/Cargo.toml +++ b/lib/codecs/Cargo.toml @@ -15,6 +15,7 @@ path = "tests/bin/generate-avro-fixtures.rs" [dependencies] apache-avro = { version = "0.20.0", default-features = false } arrow = { version = "56.2.0", default-features = false, features = ["ipc"] } +parquet = { version = "56.2.0", default-features = false, features = ["arrow", "snap", "zstd", "lz4", "brotli", "flate2", "flate2-rust_backened"], optional = true } bytes.workspace = true chrono.workspace = true rust_decimal = { version = "1.37", default-features = false, features = ["std"] } @@ -59,5 +60,6 @@ vrl.workspace = true [features] arrow = [] +parquet = ["dep:parquet", "arrow"] opentelemetry = ["dep:opentelemetry-proto"] syslog = ["dep:syslog_loose"] diff --git a/lib/codecs/src/encoding/format/arrow.rs b/lib/codecs/src/encoding/format/arrow.rs index db4dc491f4cc3..0d018a9108fd8 100644 --- a/lib/codecs/src/encoding/format/arrow.rs +++ b/lib/codecs/src/encoding/format/arrow.rs @@ -213,7 +213,7 @@ pub fn encode_events_to_arrow_ipc_stream( } /// Recursively makes a Field and all its nested fields nullable -fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Field { +pub fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Field { let new_data_type = match field.data_type() { DataType::List(inner_field) => DataType::List(Arc::new(make_field_nullable(inner_field))), DataType::Struct(fields) => { @@ -232,7 +232,7 @@ fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Fie } /// Builds an Arrow RecordBatch from events -fn build_record_batch( +pub fn build_record_batch( schema: Arc, events: &[Event], ) -> Result { diff --git a/lib/codecs/src/encoding/format/mod.rs b/lib/codecs/src/encoding/format/mod.rs index 0d21e8b94e25c..1c0ae27a57485 100644 --- a/lib/codecs/src/encoding/format/mod.rs +++ b/lib/codecs/src/encoding/format/mod.rs @@ -16,6 +16,8 @@ mod native; mod native_json; #[cfg(feature = "opentelemetry")] mod otlp; +#[cfg(feature = "parquet")] +mod parquet; mod protobuf; mod raw_message; mod text; @@ -34,6 +36,8 @@ pub use native::{NativeSerializer, NativeSerializerConfig}; pub use native_json::{NativeJsonSerializer, NativeJsonSerializerConfig}; #[cfg(feature = "opentelemetry")] pub use otlp::{OtlpSerializer, OtlpSerializerConfig}; +#[cfg(feature = "parquet")] +pub use parquet::{ParquetCompression, ParquetEncodingError, ParquetSerializer, ParquetSerializerConfig}; pub use protobuf::{ProtobufSerializer, ProtobufSerializerConfig, ProtobufSerializerOptions}; pub use raw_message::{RawMessageSerializer, RawMessageSerializerConfig}; pub use text::{TextSerializer, TextSerializerConfig}; diff --git a/lib/codecs/src/encoding/format/parquet.rs b/lib/codecs/src/encoding/format/parquet.rs new file mode 100644 index 0000000000000..9e72c7d4c89d0 --- /dev/null +++ b/lib/codecs/src/encoding/format/parquet.rs @@ -0,0 +1,656 @@ +//! Apache Parquet format codec for batched event encoding +//! +//! Provides Apache Parquet columnar file format encoding with static schema support. +//! This encoder writes complete Parquet files with proper metadata and footers, +//! suitable for long-term storage and analytics workloads. 
+ +use arrow::datatypes::Schema; +use bytes::{Bytes, BytesMut}; +use parquet::{ + arrow::ArrowWriter, + basic::{Compression, ZstdLevel}, + file::properties::WriterProperties, +}; +use snafu::Snafu; +use std::sync::Arc; +use vector_config::configurable_component; + +use vector_core::event::Event; + +// Reuse the Arrow encoder's record batch building logic +use super::arrow::{build_record_batch, ArrowEncodingError}; + +/// Compression algorithm for Parquet files +#[configurable_component] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub enum ParquetCompression { + /// No compression + Uncompressed, + /// Snappy compression (fast, moderate compression ratio) + #[default] + Snappy, + /// GZIP compression (slower, better compression ratio) + Gzip, + /// Brotli compression + Brotli, + /// LZ4 compression (very fast, moderate compression) + Lz4, + /// ZSTD compression (good balance of speed and compression) + Zstd, +} + +impl From for Compression { + fn from(compression: ParquetCompression) -> Self { + match compression { + ParquetCompression::Uncompressed => Compression::UNCOMPRESSED, + ParquetCompression::Snappy => Compression::SNAPPY, + ParquetCompression::Gzip => Compression::GZIP(Default::default()), + ParquetCompression::Brotli => Compression::BROTLI(Default::default()), + ParquetCompression::Lz4 => Compression::LZ4, + ParquetCompression::Zstd => Compression::ZSTD(ZstdLevel::default()), + } + } +} + +/// Configuration for Parquet serialization +#[configurable_component] +#[derive(Clone, Default)] +pub struct ParquetSerializerConfig { + /// The Arrow schema to use for encoding + /// + /// This schema defines the structure and types of the Parquet file columns. + #[serde(skip)] + #[configurable(derived)] + pub schema: Option>, + + /// Compression algorithm to use for Parquet columns + /// + /// Compression is applied to all columns in the Parquet file. + /// Snappy provides a good balance of speed and compression ratio. + #[serde(default)] + #[configurable(metadata(docs::examples = "snappy"))] + #[configurable(metadata(docs::examples = "gzip"))] + #[configurable(metadata(docs::examples = "zstd"))] + pub compression: ParquetCompression, + + /// Number of rows per row group + /// + /// Row groups are Parquet's unit of parallelization. Larger row groups + /// can improve compression but increase memory usage during encoding. + /// If not specified, defaults to the batch size. + #[serde(default)] + #[configurable(metadata(docs::examples = 100000))] + #[configurable(metadata(docs::examples = 1000000))] + pub row_group_size: Option, + + /// Allow null values for non-nullable fields in the schema. + /// + /// When enabled, missing or incompatible values will be encoded as null even for fields + /// marked as non-nullable in the Arrow schema. This is useful when working with downstream + /// systems that can handle null values through defaults, computed columns, or other mechanisms. + /// + /// When disabled (default), missing values for non-nullable fields will cause encoding errors, + /// ensuring all required data is present before writing to Parquet. 
+ #[serde(default)] + #[configurable(metadata(docs::examples = true))] + pub allow_nullable_fields: bool, +} + +impl std::fmt::Debug for ParquetSerializerConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ParquetSerializerConfig") + .field( + "schema", + &self + .schema + .as_ref() + .map(|s| format!("{} fields", s.fields().len())), + ) + .field("compression", &self.compression) + .field("row_group_size", &self.row_group_size) + .field("allow_nullable_fields", &self.allow_nullable_fields) + .finish() + } +} + +impl ParquetSerializerConfig { + /// Create a new ParquetSerializerConfig with a schema + pub fn new(schema: Arc) -> Self { + Self { + schema: Some(schema), + compression: ParquetCompression::default(), + row_group_size: None, + allow_nullable_fields: false, + } + } + + /// The data type of events that are accepted by `ParquetSerializer`. + pub fn input_type(&self) -> vector_core::config::DataType { + vector_core::config::DataType::Log + } + + /// The schema required by the serializer. + pub fn schema_requirement(&self) -> vector_core::schema::Requirement { + vector_core::schema::Requirement::empty() + } +} + +/// Parquet batch serializer that holds the schema and writer configuration +#[derive(Clone, Debug)] +pub struct ParquetSerializer { + schema: Arc, + writer_properties: WriterProperties, +} + +impl ParquetSerializer { + /// Create a new ParquetSerializer with the given configuration + pub fn new(config: ParquetSerializerConfig) -> Result { + let mut schema = config.schema.ok_or_else(|| { + vector_common::Error::from( + "Parquet serializer requires a schema. Pass a schema or fetch from provider before creating serializer." + ) + })?; + + // If allow_nullable_fields is enabled, transform the schema once here + // instead of on every batch encoding + if config.allow_nullable_fields { + schema = Arc::new(Schema::new_with_metadata( + schema + .fields() + .iter() + .map(|f| Arc::new(super::arrow::make_field_nullable(f))) + .collect::>(), + schema.metadata().clone(), + )); + } + + // Build writer properties + let mut props_builder = WriterProperties::builder() + .set_compression(config.compression.into()); + + if let Some(row_group_size) = config.row_group_size { + props_builder = props_builder.set_max_row_group_size(row_group_size); + } + + let writer_properties = props_builder.build(); + + Ok(Self { + schema, + writer_properties, + }) + } +} + +impl tokio_util::codec::Encoder> for ParquetSerializer { + type Error = ParquetEncodingError; + + fn encode(&mut self, events: Vec, buffer: &mut BytesMut) -> Result<(), Self::Error> { + if events.is_empty() { + return Err(ParquetEncodingError::NoEvents); + } + + let bytes = encode_events_to_parquet(&events, Arc::clone(&self.schema), &self.writer_properties)?; + + buffer.extend_from_slice(&bytes); + Ok(()) + } +} + +/// Errors that can occur during Parquet encoding +#[derive(Debug, Snafu)] +pub enum ParquetEncodingError { + /// Failed to build Arrow record batch + #[snafu(display("Failed to build Arrow record batch: {}", source))] + RecordBatchCreation { + /// The underlying Arrow encoding error + source: ArrowEncodingError, + }, + + /// Failed to write Parquet data + #[snafu(display("Failed to write Parquet data: {}", source))] + ParquetWrite { + /// The underlying Parquet error + source: parquet::errors::ParquetError, + }, + + /// No events provided for encoding + #[snafu(display("No events provided for encoding"))] + NoEvents, + + /// Schema must be provided before encoding + 
#[snafu(display("Schema must be provided before encoding"))] + NoSchemaProvided, + + /// IO error during encoding + #[snafu(display("IO error: {}", source))] + Io { + /// The underlying IO error + source: std::io::Error, + }, +} + +impl From for ParquetEncodingError { + fn from(error: std::io::Error) -> Self { + Self::Io { source: error } + } +} + +impl From for ParquetEncodingError { + fn from(error: ArrowEncodingError) -> Self { + Self::RecordBatchCreation { source: error } + } +} + +impl From for ParquetEncodingError { + fn from(error: parquet::errors::ParquetError) -> Self { + Self::ParquetWrite { source: error } + } +} + +/// Encodes a batch of events into Parquet format +pub fn encode_events_to_parquet( + events: &[Event], + schema: Arc, + writer_properties: &WriterProperties, +) -> Result { + if events.is_empty() { + return Err(ParquetEncodingError::NoEvents); + } + + // Build Arrow RecordBatch from events (reuses Arrow encoder logic) + let record_batch = build_record_batch(schema, events)?; + + // Write RecordBatch to Parquet format in memory + let mut buffer = Vec::new(); + { + let mut writer = ArrowWriter::try_new( + &mut buffer, + record_batch.schema(), + Some(writer_properties.clone()), + )?; + + writer.write(&record_batch)?; + writer.close()?; + } + + Ok(Bytes::from(buffer)) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::{ + array::{ + Array, BinaryArray, BooleanArray, Float64Array, Int64Array, StringArray, + TimestampMicrosecondArray, + }, + datatypes::{DataType, Field, TimeUnit}, + }; + use bytes::Bytes; + use chrono::Utc; + use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + use vector_core::event::LogEvent; + + #[test] + fn test_encode_all_types() { + let mut log = LogEvent::default(); + log.insert("string_field", "test"); + log.insert("int64_field", 42); + log.insert("float64_field", 3.15); + log.insert("bool_field", true); + log.insert("bytes_field", bytes::Bytes::from("binary")); + log.insert("timestamp_field", Utc::now()); + + let events = vec![Event::Log(log)]; + + let schema = Arc::new(Schema::new(vec![ + Field::new("string_field", DataType::Utf8, true), + Field::new("int64_field", DataType::Int64, true), + Field::new("float64_field", DataType::Float64, true), + Field::new("bool_field", DataType::Boolean, true), + Field::new("bytes_field", DataType::Binary, true), + Field::new( + "timestamp_field", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ), + ])); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + + // Verify it's valid Parquet by reading it back + let reader = ParquetRecordBatchReaderBuilder::try_new(bytes) + .unwrap() + .build() + .unwrap(); + + let batches: Vec<_> = reader.collect::>().unwrap(); + assert_eq!(batches.len(), 1); + + let batch = &batches[0]; + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 6); + + // Verify string field + assert_eq!( + batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + "test" + ); + + // Verify int64 field + assert_eq!( + batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 42 + ); + + // Verify float64 field + assert!( + (batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + - 3.15) + .abs() + < 0.001 + ); + + // Verify boolean field + assert!( + batch + .column(3) + .as_any() + .downcast_ref::() + 
.unwrap() + .value(0) + ); + + // Verify binary field + assert_eq!( + batch + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + b"binary" + ); + + // Verify timestamp field + assert!( + !batch + .column(5) + .as_any() + .downcast_ref::() + .unwrap() + .is_null(0) + ); + } + + #[test] + fn test_encode_null_values() { + let mut log1 = LogEvent::default(); + log1.insert("field_a", 1); + // field_b is missing + + let mut log2 = LogEvent::default(); + log2.insert("field_b", 2); + // field_a is missing + + let events = vec![Event::Log(log1), Event::Log(log2)]; + + let schema = Arc::new(Schema::new(vec![ + Field::new("field_a", DataType::Int64, true), + Field::new("field_b", DataType::Int64, true), + ])); + + let props = WriterProperties::builder().build(); + + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let reader = ParquetRecordBatchReaderBuilder::try_new(bytes) + .unwrap() + .build() + .unwrap(); + + let batches: Vec<_> = reader.collect::>().unwrap(); + let batch = &batches[0]; + + assert_eq!(batch.num_rows(), 2); + + let field_a = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(field_a.value(0), 1); + assert!(field_a.is_null(1)); + + let field_b = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(field_b.is_null(0)); + assert_eq!(field_b.value(1), 2); + } + + #[test] + fn test_encode_empty_events() { + let events: Vec = vec![]; + let schema = Arc::new(Schema::new(vec![Field::new( + "field", + DataType::Int64, + true, + )])); + let props = WriterProperties::builder().build(); + let result = encode_events_to_parquet(&events, schema, &props); + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), ParquetEncodingError::NoEvents)); + } + + #[test] + fn test_parquet_compression_types() { + let mut log = LogEvent::default(); + log.insert("message", "test message"); + + let events = vec![Event::Log(log)]; + let schema = Arc::new(Schema::new(vec![Field::new( + "message", + DataType::Utf8, + true, + )])); + + // Test different compression algorithms + let compressions = vec![ + ParquetCompression::Uncompressed, + ParquetCompression::Snappy, + ParquetCompression::Gzip, + ParquetCompression::Zstd, + ]; + + for compression in compressions { + let props = WriterProperties::builder() + .set_compression(compression.into()) + .build(); + + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); + assert!(result.is_ok(), "Failed with compression: {:?}", compression); + + // Verify we can read it back + let bytes = result.unwrap(); + let reader = ParquetRecordBatchReaderBuilder::try_new(bytes) + .unwrap() + .build() + .unwrap(); + + let batches: Vec<_> = reader.collect::>().unwrap(); + assert_eq!(batches[0].num_rows(), 1); + } + } + + #[test] + fn test_parquet_serializer_config() { + let schema = Arc::new(Schema::new(vec![Field::new( + "field", + DataType::Int64, + true, + )])); + + let config = ParquetSerializerConfig { + schema: Some(schema), + compression: ParquetCompression::Zstd, + row_group_size: Some(1000), + allow_nullable_fields: false, + }; + + let serializer = ParquetSerializer::new(config); + assert!(serializer.is_ok()); + } + + #[test] + fn test_parquet_serializer_no_schema_fails() { + let config = ParquetSerializerConfig { + schema: None, + compression: ParquetCompression::default(), + row_group_size: None, + allow_nullable_fields: false, + }; + + let result = ParquetSerializer::new(config); + 
assert!(result.is_err()); + } + + #[test] + fn test_encoder_trait_implementation() { + use tokio_util::codec::Encoder; + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, true), + Field::new("name", DataType::Utf8, true), + ])); + + let config = ParquetSerializerConfig::new(schema); + let mut serializer = ParquetSerializer::new(config).unwrap(); + + let mut log = LogEvent::default(); + log.insert("id", 1); + log.insert("name", "test"); + + let events = vec![Event::Log(log)]; + let mut buffer = BytesMut::new(); + + let result = serializer.encode(events, &mut buffer); + assert!(result.is_ok()); + assert!(!buffer.is_empty()); + + // Verify the buffer contains valid Parquet data + let bytes = Bytes::copy_from_slice(&buffer); + let reader = ParquetRecordBatchReaderBuilder::try_new(bytes); + assert!(reader.is_ok()); + } + + #[test] + fn test_large_batch_encoding() { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, true), + Field::new("value", DataType::Float64, true), + ])); + + // Create 10,000 events + let events: Vec = (0..10000) + .map(|i| { + let mut log = LogEvent::default(); + log.insert("id", i); + log.insert("value", i as f64 * 1.5); + Event::Log(log) + }) + .collect(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_size(5000) // 2 row groups + .build(); + + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let reader = ParquetRecordBatchReaderBuilder::try_new(bytes) + .unwrap() + .build() + .unwrap(); + + let batches: Vec<_> = reader.collect::>().unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 10000); + } + + #[test] + fn test_allow_nullable_fields_config() { + use tokio_util::codec::Encoder; + + let schema = Arc::new(Schema::new(vec![Field::new( + "required_field", + DataType::Int64, + false, // Non-nullable + )])); + + let mut log1 = LogEvent::default(); + log1.insert("required_field", 42); + + let log2 = LogEvent::default(); + // log2 is missing required_field + + let events = vec![Event::Log(log1), Event::Log(log2)]; + + // With allow_nullable_fields = true, should succeed + let mut config = ParquetSerializerConfig::new(Arc::clone(&schema)); + config.allow_nullable_fields = true; + + let mut serializer = ParquetSerializer::new(config).unwrap(); + let mut buffer = BytesMut::new(); + let result = serializer.encode(events.clone(), &mut buffer); + assert!(result.is_ok()); + + // Verify the data + let bytes = Bytes::copy_from_slice(&buffer); + let reader = ParquetRecordBatchReaderBuilder::try_new(bytes) + .unwrap() + .build() + .unwrap(); + + let batches: Vec<_> = reader.collect::>().unwrap(); + let batch = &batches[0]; + + let array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(array.value(0), 42); + assert!(array.is_null(1)); + } +} diff --git a/lib/codecs/src/encoding/mod.rs b/lib/codecs/src/encoding/mod.rs index 3fe0baafa8b91..22b853336e2d7 100644 --- a/lib/codecs/src/encoding/mod.rs +++ b/lib/codecs/src/encoding/mod.rs @@ -17,6 +17,8 @@ pub use format::{ ProtobufSerializerOptions, RawMessageSerializer, RawMessageSerializerConfig, TextSerializer, TextSerializerConfig, }; +#[cfg(feature = "parquet")] +pub use format::{ParquetCompression, ParquetEncodingError, ParquetSerializer, ParquetSerializerConfig}; #[cfg(feature = "opentelemetry")] pub use format::{OtlpSerializer, OtlpSerializerConfig}; 
pub use framing::{ @@ -26,7 +28,7 @@ pub use framing::{ NewlineDelimitedEncoderConfig, VarintLengthDelimitedEncoder, VarintLengthDelimitedEncoderConfig, }; -#[cfg(feature = "arrow")] +#[cfg(any(feature = "arrow", feature = "parquet"))] pub use serializer::BatchSerializerConfig; pub use serializer::{Serializer, SerializerConfig}; diff --git a/lib/codecs/src/encoding/serializer.rs b/lib/codecs/src/encoding/serializer.rs index 899e03d60e4ec..1982f6af43ef0 100644 --- a/lib/codecs/src/encoding/serializer.rs +++ b/lib/codecs/src/encoding/serializer.rs @@ -1,13 +1,17 @@ //! Serializer configuration and implementation for encoding structured events as bytes. use bytes::BytesMut; +#[cfg(feature = "parquet")] +use vector_common::Error as VectorError; use vector_config::configurable_component; use vector_core::{config::DataType, event::Event, schema}; #[cfg(feature = "arrow")] -use super::format::{ArrowStreamSerializer, ArrowStreamSerializerConfig}; +use super::format::ArrowStreamSerializerConfig; #[cfg(feature = "opentelemetry")] use super::format::{OtlpSerializer, OtlpSerializerConfig}; +#[cfg(feature = "parquet")] +use super::format::ParquetSerializerConfig; use super::{ chunking::Chunker, format::{ @@ -110,6 +114,16 @@ pub enum SerializerConfig { /// [protobuf]: https://protobuf.dev/ Protobuf(ProtobufSerializerConfig), + /// Encodes events in [Apache Parquet][apache_parquet] columnar format. + /// + /// Parquet is a columnar storage format optimized for analytics workloads. + /// It provides efficient compression and encoding schemes, making it ideal + /// for long-term storage and query performance. + /// + /// [apache_parquet]: https://parquet.apache.org/ + #[cfg(feature = "parquet")] + Parquet(ParquetSerializerConfig), + /// No encoding. /// /// This encoding uses the `message` field of a log event. @@ -153,32 +167,37 @@ pub enum BatchSerializerConfig { #[cfg(feature = "arrow")] #[serde(rename = "arrow_stream")] ArrowStream(ArrowStreamSerializerConfig), + + /// Encodes events in [Apache Parquet][apache_parquet] columnar format. + /// + /// Parquet is a columnar storage format optimized for analytics workloads. + /// It provides efficient compression and encoding schemes, making it ideal + /// for long-term storage and query performance. + /// + /// [apache_parquet]: https://parquet.apache.org/ + #[cfg(feature = "parquet")] + Parquet(ParquetSerializerConfig), } -#[cfg(feature = "arrow")] +#[cfg(any(feature = "arrow", feature = "parquet"))] impl BatchSerializerConfig { - /// Build the `ArrowStreamSerializer` from this configuration. - pub fn build( - &self, - ) -> Result> { - match self { - BatchSerializerConfig::ArrowStream(arrow_config) => { - ArrowStreamSerializer::new(arrow_config.clone()) - } - } - } - /// The data type of events that are accepted by this batch serializer. pub fn input_type(&self) -> DataType { match self { + #[cfg(feature = "arrow")] BatchSerializerConfig::ArrowStream(arrow_config) => arrow_config.input_type(), + #[cfg(feature = "parquet")] + BatchSerializerConfig::Parquet(parquet_config) => parquet_config.input_type(), } } /// The schema required by the batch serializer. 
pub fn schema_requirement(&self) -> schema::Requirement { match self { + #[cfg(feature = "arrow")] BatchSerializerConfig::ArrowStream(arrow_config) => arrow_config.schema_requirement(), + #[cfg(feature = "parquet")] + BatchSerializerConfig::Parquet(parquet_config) => parquet_config.schema_requirement(), } } } @@ -281,6 +300,13 @@ impl SerializerConfig { Ok(Serializer::RawMessage(RawMessageSerializerConfig.build())) } SerializerConfig::Text(config) => Ok(Serializer::Text(config.build())), + #[cfg(feature = "parquet")] + SerializerConfig::Parquet(_) => Err( + VectorError::from( + "Parquet codec is available only for batch encoding and cannot be built as a framed serializer.", + ) + .into(), + ), } } @@ -316,6 +342,8 @@ impl SerializerConfig { SerializerConfig::Gelf(_) => { FramingConfig::CharacterDelimited(CharacterDelimitedEncoderConfig::new(0)) } + #[cfg(feature = "parquet")] + SerializerConfig::Parquet(_) => FramingConfig::NewlineDelimited, } } @@ -330,15 +358,17 @@ impl SerializerConfig { SerializerConfig::Gelf(config) => config.input_type(), SerializerConfig::Json(config) => config.input_type(), SerializerConfig::Logfmt => LogfmtSerializerConfig.input_type(), - SerializerConfig::Native => NativeSerializerConfig.input_type(), - SerializerConfig::NativeJson => NativeJsonSerializerConfig.input_type(), - #[cfg(feature = "opentelemetry")] - SerializerConfig::Otlp => OtlpSerializerConfig::default().input_type(), - SerializerConfig::Protobuf(config) => config.input_type(), - SerializerConfig::RawMessage => RawMessageSerializerConfig.input_type(), - SerializerConfig::Text(config) => config.input_type(), - } + SerializerConfig::Native => NativeSerializerConfig.input_type(), + SerializerConfig::NativeJson => NativeJsonSerializerConfig.input_type(), + #[cfg(feature = "opentelemetry")] + SerializerConfig::Otlp => OtlpSerializerConfig::default().input_type(), + SerializerConfig::Protobuf(config) => config.input_type(), + #[cfg(feature = "parquet")] + SerializerConfig::Parquet(config) => config.input_type(), + SerializerConfig::RawMessage => RawMessageSerializerConfig.input_type(), + SerializerConfig::Text(config) => config.input_type(), } +} /// The schema required by the serializer. pub fn schema_requirement(&self) -> schema::Requirement { @@ -352,15 +382,17 @@ impl SerializerConfig { SerializerConfig::Json(config) => config.schema_requirement(), SerializerConfig::Logfmt => LogfmtSerializerConfig.schema_requirement(), SerializerConfig::Native => NativeSerializerConfig.schema_requirement(), - SerializerConfig::NativeJson => NativeJsonSerializerConfig.schema_requirement(), - #[cfg(feature = "opentelemetry")] - SerializerConfig::Otlp => OtlpSerializerConfig::default().schema_requirement(), - SerializerConfig::Protobuf(config) => config.schema_requirement(), - SerializerConfig::RawMessage => RawMessageSerializerConfig.schema_requirement(), - SerializerConfig::Text(config) => config.schema_requirement(), - } + SerializerConfig::NativeJson => NativeJsonSerializerConfig.schema_requirement(), + #[cfg(feature = "opentelemetry")] + SerializerConfig::Otlp => OtlpSerializerConfig::default().schema_requirement(), + SerializerConfig::Protobuf(config) => config.schema_requirement(), + #[cfg(feature = "parquet")] + SerializerConfig::Parquet(config) => config.schema_requirement(), + SerializerConfig::RawMessage => RawMessageSerializerConfig.schema_requirement(), + SerializerConfig::Text(config) => config.schema_requirement(), } } +} /// Serialize structured events as bytes. 
#[derive(Debug, Clone)] @@ -418,11 +450,11 @@ impl Serializer { /// if you need to determine the capability to encode to JSON at runtime. pub fn to_json_value(&self, event: Event) -> Result { match self { - Serializer::Gelf(serializer) => serializer.to_json_value(event), - Serializer::Json(serializer) => serializer.to_json_value(event), - Serializer::NativeJson(serializer) => serializer.to_json_value(event), - Serializer::Avro(_) - | Serializer::Cef(_) + Serializer::Gelf(serializer) => serializer.to_json_value(event), + Serializer::Json(serializer) => serializer.to_json_value(event), + Serializer::NativeJson(serializer) => serializer.to_json_value(event), + Serializer::Avro(_) + | Serializer::Cef(_) | Serializer::Csv(_) | Serializer::Logfmt(_) | Serializer::Text(_) diff --git a/lib/vector-lib/Cargo.toml b/lib/vector-lib/Cargo.toml index c72af97fdaa62..35bd661ddb372 100644 --- a/lib/vector-lib/Cargo.toml +++ b/lib/vector-lib/Cargo.toml @@ -27,6 +27,7 @@ vrl = { workspace = true, optional = true } allocation-tracing = ["vector-top?/allocation-tracing"] api-client = ["dep:vector-api-client"] arrow = ["codecs/arrow"] +parquet = ["codecs/parquet"] api = ["vector-tap/api"] file-source = ["dep:file-source", "dep:file-source-common"] lua = ["vector-core/lua"] diff --git a/src/codecs/encoding/config.rs b/src/codecs/encoding/config.rs index 255db45b538ed..f1c6a8bcad78e 100644 --- a/src/codecs/encoding/config.rs +++ b/src/codecs/encoding/config.rs @@ -1,4 +1,8 @@ +#[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] +use crate::codecs::{BatchEncoder, BatchSerializer}; use crate::codecs::{Encoder, EncoderKind, Transformer}; +#[cfg(feature = "codecs-parquet")] +use vector_lib::codecs::encoding::ParquetSerializer; use vector_lib::{ codecs::{ CharacterDelimitedEncoder, LengthDelimitedEncoder, NewlineDelimitedEncoder, @@ -141,9 +145,22 @@ impl EncodingConfigWithFraming { /// Build the `Transformer` and `EncoderKind` for this config. pub fn build_encoder(&self, sink_type: SinkType) -> crate::Result<(Transformer, EncoderKind)> { - let (framer, serializer) = self.build(sink_type)?; - let encoder = EncoderKind::Framed(Box::new(Encoder::::new(framer, serializer))); - Ok((self.transformer(), encoder)) + match &self.encoding.encoding { + #[cfg(feature = "codecs-parquet")] + SerializerConfig::Parquet(parquet_config) => { + let serializer = ParquetSerializer::new(parquet_config.clone())?; + let encoder = EncoderKind::Batch(BatchEncoder::new(BatchSerializer::Parquet( + serializer, + ))); + Ok((self.transformer(), encoder)) + } + _ => { + let (framer, serializer) = self.build(sink_type)?; + let encoder = + EncoderKind::Framed(Box::new(Encoder::::new(framer, serializer))); + Ok((self.transformer(), encoder)) + } + } } } diff --git a/src/codecs/encoding/encoder.rs b/src/codecs/encoding/encoder.rs index 333c29b4840cf..cd276ac039550 100644 --- a/src/codecs/encoding/encoder.rs +++ b/src/codecs/encoding/encoder.rs @@ -2,6 +2,8 @@ use bytes::BytesMut; use tokio_util::codec::Encoder as _; #[cfg(feature = "codecs-arrow")] use vector_lib::codecs::encoding::ArrowStreamSerializer; +#[cfg(feature = "codecs-parquet")] +use vector_lib::codecs::encoding::ParquetSerializer; use vector_lib::codecs::{ CharacterDelimitedEncoder, NewlineDelimitedEncoder, TextSerializerConfig, encoding::{Error, Framer, Serializer}, @@ -18,6 +20,9 @@ pub enum BatchSerializer { /// Arrow IPC stream format serializer. #[cfg(feature = "codecs-arrow")] Arrow(ArrowStreamSerializer), + /// Parquet columnar format serializer. 
+ #[cfg(feature = "codecs-parquet")] + Parquet(ParquetSerializer), } /// An encoder that encodes batches of events. @@ -38,10 +43,13 @@ impl BatchEncoder { } /// Get the HTTP content type. - #[cfg(feature = "codecs-arrow")] + #[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] pub const fn content_type(&self) -> &'static str { match &self.serializer { + #[cfg(feature = "codecs-arrow")] BatchSerializer::Arrow(_) => "application/vnd.apache.arrow.stream", + #[cfg(feature = "codecs-parquet")] + BatchSerializer::Parquet(_) => "application/vnd.apache.parquet", } } } @@ -65,6 +73,24 @@ impl tokio_util::codec::Encoder> for BatchEncoder { } }) } + #[cfg(feature = "codecs-parquet")] + BatchSerializer::Parquet(serializer) => { + serializer.encode(events, buffer).map_err(|err| { + use vector_lib::codecs::encoding::ParquetEncodingError; + match err { + ParquetEncodingError::RecordBatchCreation { source } => { + use vector_lib::codecs::encoding::ArrowEncodingError; + match source { + ArrowEncodingError::NullConstraint { .. } => { + Error::SchemaConstraintViolation(Box::new(err)) + } + _ => Error::SerializingError(Box::new(err)), + } + } + _ => Error::SerializingError(Box::new(err)), + } + }) + } _ => unreachable!("BatchSerializer cannot be constructed without encode()"), } } @@ -76,7 +102,7 @@ pub enum EncoderKind { /// Uses framing to encode individual events Framed(Box>), /// Encodes events in batches without framing - #[cfg(feature = "codecs-arrow")] + #[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] Batch(BatchEncoder), } diff --git a/src/sinks/aws_s3/config.rs b/src/sinks/aws_s3/config.rs index 08ba92245d65c..6a0f51098e8aa 100644 --- a/src/sinks/aws_s3/config.rs +++ b/src/sinks/aws_s3/config.rs @@ -2,10 +2,7 @@ use aws_sdk_s3::Client as S3Client; use tower::ServiceBuilder; use vector_lib::{ TimeZone, - codecs::{ - TextSerializerConfig, - encoding::{Framer, FramingConfig}, - }, + codecs::{TextSerializerConfig, encoding::FramingConfig}, configurable::configurable_component, sink::VectorSink, }; @@ -13,7 +10,7 @@ use vector_lib::{ use super::sink::S3RequestOptions; use crate::{ aws::{AwsAuthentication, RegionOrEndpoint}, - codecs::{Encoder, EncodingConfigWithFraming, SinkType}, + codecs::{EncodingConfigWithFraming, SinkType}, config::{AcknowledgementsConfig, GenerateConfig, Input, ProxyConfig, SinkConfig, SinkContext}, sinks::{ Healthcheck, @@ -245,9 +242,7 @@ impl S3SinkConfig { let partitioner = S3KeyPartitioner::new(key_prefix, ssekms_key_id, None); - let transformer = self.encoding.transformer(); - let (framer, serializer) = self.encoding.build(SinkType::MessageBased)?; - let encoder = Encoder::::new(framer, serializer); + let (transformer, encoder) = self.encoding.build_encoder(SinkType::MessageBased)?; let request_options = S3RequestOptions { bucket: self.bucket.clone(), diff --git a/src/sinks/aws_s3/sink.rs b/src/sinks/aws_s3/sink.rs index 26d47cdb7039c..e99e5634aafac 100644 --- a/src/sinks/aws_s3/sink.rs +++ b/src/sinks/aws_s3/sink.rs @@ -3,10 +3,10 @@ use std::io; use bytes::Bytes; use chrono::{FixedOffset, Utc}; use uuid::Uuid; -use vector_lib::{codecs::encoding::Framer, event::Finalizable, request_metadata::RequestMetadata}; +use vector_lib::{event::Finalizable, request_metadata::RequestMetadata}; use crate::{ - codecs::{Encoder, Transformer}, + codecs::{EncoderKind, Transformer}, event::Event, sinks::{ s3_common::{ @@ -28,7 +28,7 @@ pub struct S3RequestOptions { pub filename_append_uuid: bool, pub filename_extension: Option, pub api_options: S3Options, - pub 
encoder: (Transformer, Encoder), + pub encoder: (Transformer, EncoderKind), pub compression: Compression, pub filename_tz_offset: Option, } @@ -36,7 +36,7 @@ pub struct S3RequestOptions { impl RequestBuilder<(S3PartitionKey, Vec)> for S3RequestOptions { type Metadata = S3Metadata; type Events = Vec; - type Encoder = (Transformer, Encoder); + type Encoder = (Transformer, EncoderKind); type Payload = Bytes; type Request = S3Request; type Error = io::Error; // TODO: this is ugly. diff --git a/src/sinks/util/encoding.rs b/src/sinks/util/encoding.rs index 6265021ef6f1a..39162ed3065b6 100644 --- a/src/sinks/util/encoding.rs +++ b/src/sinks/util/encoding.rs @@ -8,7 +8,7 @@ use vector_lib::{ request_metadata::GroupedCountByteSize, }; -#[cfg(feature = "codecs-arrow")] +#[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] use crate::internal_events::EncoderNullConstraintError; use crate::{codecs::Transformer, event::Event, internal_events::EncoderWriteError}; @@ -99,7 +99,7 @@ impl Encoder for (Transformer, crate::codecs::Encoder<()>) { } } -#[cfg(feature = "codecs-arrow")] +#[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] impl Encoder> for (Transformer, crate::codecs::BatchEncoder) { fn encode_input( &self, @@ -150,7 +150,7 @@ impl Encoder> for (Transformer, crate::codecs::EncoderKind) { crate::codecs::EncoderKind::Framed(encoder) => { (self.0.clone(), *encoder.clone()).encode_input(events, writer) } - #[cfg(feature = "codecs-arrow")] + #[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] crate::codecs::EncoderKind::Batch(encoder) => { (self.0.clone(), encoder.clone()).encode_input(events, writer) } diff --git a/website/cue/reference/components/sinks.cue b/website/cue/reference/components/sinks.cue index cb399721a8c0b..7ebcf691e30bf 100644 --- a/website/cue/reference/components/sinks.cue +++ b/website/cue/reference/components/sinks.cue @@ -188,6 +188,20 @@ components: sinks: [Name=string]: { [apache_avro]: https://avro.apache.org/ """ } + if codec == "parquet" { + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. It provides + efficient compression and encoding schemes, making it ideal for long-term storage and + query performance with tools like AWS Athena, Apache Spark, and Presto. + + This is a batch encoder that encodes multiple events at once into a single Parquet file. + Each batch of events becomes one Parquet file with proper metadata and footers. + + [apache_parquet]: https://parquet.apache.org/ + """ + } } } } @@ -216,6 +230,55 @@ components: sinks: [Name=string]: { } } } + if codec == "parquet" { + parquet: { + description: "Apache Parquet-specific encoder options." + required: false + relevant_when: "codec = `parquet`" + type: object: options: { + compression: { + description: "Compression algorithm for Parquet columns." + required: false + type: string: { + default: "snappy" + enum: { + snappy: "Snappy compression (fast, moderate compression ratio)" + gzip: "GZIP compression (balanced, good for AWS Athena)" + zstd: "ZSTD compression (best compression ratio)" + lz4: "LZ4 compression (very fast)" + brotli: "Brotli compression (good compression)" + uncompressed: "No compression" + } + } + } + row_group_size: { + description: """ + Number of rows per row group. + + Row groups are Parquet's unit of parallelization. Larger row groups can improve + compression but increase memory usage during encoding. 
If not specified, defaults + to the batch size. + """ + required: false + type: uint: { + default: null + examples: [100000, 1000000] + } + } + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the schema. This is useful when working with downstream + systems that can handle null values through defaults or computed columns. + """ + required: false + type: bool: default: false + } + } + } + } } } diff --git a/website/cue/reference/components/sinks/aws_s3.cue b/website/cue/reference/components/sinks/aws_s3.cue index cec5a50e47f82..3bc440ee0a0f7 100644 --- a/website/cue/reference/components/sinks/aws_s3.cue +++ b/website/cue/reference/components/sinks/aws_s3.cue @@ -34,7 +34,7 @@ components: sinks: aws_s3: components._aws & { codec: { enabled: true framing: true - enum: ["json", "text"] + enum: ["json", "text", "parquet"] } } proxy: enabled: true @@ -103,6 +103,60 @@ components: sinks: aws_s3: components._aws & { """ } + parquet_encoding: { + title: "Parquet encoding" + body: """ + The AWS S3 sink supports encoding events in [Apache Parquet](\(urls.apache_parquet)) + format, which is a columnar storage format optimized for analytics workloads. Parquet + provides efficient compression and encoding schemes, making it ideal for long-term + storage and query performance with tools like AWS Athena, Apache Spark, and Presto. + + When using Parquet encoding, you must specify a schema that defines the structure and + types of the Parquet file columns. Vector events are converted to Arrow RecordBatches + and then written as Parquet files. + + **Supported Parquet compression algorithms:** + - `snappy` (default): Fast compression with moderate compression ratio + - `gzip`: Balanced compression, good for AWS Athena compatibility + - `zstd`: Best compression ratio, ideal for cold storage + - `lz4`: Very fast compression, good for high-throughput scenarios + - `brotli`: Good compression, web-optimized + - `uncompressed`: No compression + + **Example configuration:** + + ```yaml + encoding: + codec: parquet + schema: + timestamp: timestamp_micros + user_id: utf8 + action: utf8 + value: int64 + duration_ms: float64 + compression: snappy + row_group_size: 100000 + allow_nullable_fields: true + ``` + + **Supported data types:** + - Strings: `utf8` + - Integers: `int8`, `int16`, `int32`, `int64` + - Unsigned integers: `uint8`, `uint16`, `uint32`, `uint64` + - Floats: `float32`, `float64` + - Timestamps: `timestamp_second`, `timestamp_millisecond`, `timestamp_microsecond`, `timestamp_nanosecond` + - Boolean: `boolean` + - Binary: `binary` + - Decimals: `decimal128`, `decimal256` + + **Note:** When using Parquet encoding, set `compression: none` at the sink level since + Parquet handles compression internally through its columnar compression algorithms. + + Each batch of events becomes one Parquet file in S3, with the batch size controlled by + the `batch.max_events`, `batch.max_bytes`, and `batch.timeout_secs` settings. 
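+					As a sketch (the specific values are illustrative, not recommendations), the sink-level
+					batch and compression settings referenced above can be paired with the Parquet encoder
+					like this:
+
+					```yaml
+					batch:
+					  max_events: 100000
+					  timeout_secs: 300
+					compression: none
+					encoding:
+					  codec: parquet
+					  row_group_size: 100000
+					```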
+ """ + } + log_on_put: { title: "Emit a log when putting an object" body: """ diff --git a/website/cue/reference/components/sinks/generated/aws_s3.cue b/website/cue/reference/components/sinks/generated/aws_s3.cue index f17b2abf1e74f..733368b35cdbd 100644 --- a/website/cue/reference/components/sinks/generated/aws_s3.cue +++ b/website/cue/reference/components/sinks/generated/aws_s3.cue @@ -497,6 +497,17 @@ generated: components: sinks: aws_s3: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. It provides + efficient compression and encoding schemes, making it ideal for long-term storage and + query performance with tools like AWS Athena, Apache Spark, and Presto. + + This is a batch encoder that writes one Parquet file per batch with proper metadata and footers. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -660,6 +671,53 @@ generated: components: sinks: aws_s3: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: false + type: object: options: { + compression: { + description: "Compression algorithm for Parquet columns." + required: false + type: string: { + default: "snappy" + enum: { + snappy: "Snappy compression (fast, moderate compression ratio)" + gzip: "GZIP compression (balanced, good for AWS Athena)" + zstd: "ZSTD compression (best compression ratio)" + lz4: "LZ4 compression (very fast)" + brotli: "Brotli compression (good compression)" + uncompressed: "No compression" + } + } + } + row_group_size: { + description: """ + Number of rows per row group. + + Row groups are Parquet's unit of parallelization. Larger row groups can improve + compression but increase memory usage during encoding. If not specified, defaults + to the batch size. + """ + required: false + type: uint: { + default: null + examples: [100000, 1000000] + } + } + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the schema. This is useful when working with downstream + systems that can handle null values through defaults or computed columns. + """ + required: false + type: bool: default: false + } + } + } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" @@ -738,6 +796,7 @@ generated: components: sinks: aws_s3: configuration: { required: false type: string: examples: [ "json", + "parquet", ] } filename_time_format: { diff --git a/website/cue/reference/urls.cue b/website/cue/reference/urls.cue index 2578c0b38c60a..3a56000c49fed 100644 --- a/website/cue/reference/urls.cue +++ b/website/cue/reference/urls.cue @@ -19,6 +19,7 @@ urls: { apache_extended_status: "\(apache)/docs/current/mod/core.html#extendedstatus" apache_install: "\(apache)/docs/current/install.html" apache_mod_status: "http://httpd.apache.org/docs/current/mod/mod_status.html" + apache_parquet: "https://parquet.apache.org/" apt: "\(wikipedia)/wiki/APT_(software)" arm: "\(wikipedia)/wiki/ARM_architecture" aws_access_keys: "\(aws_docs)/IAM/latest/UserGuide/id_credentials_access-keys.html" From 1b630aa9fcb7dfa711862d531324e00c12c34029 Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Fri, 12 Dec 2025 12:59:00 +0100 Subject: [PATCH 02/13] Added schema def --- lib/codecs/src/encoding/format/mod.rs | 4 + lib/codecs/src/encoding/format/parquet.rs | 80 ++++--- .../src/encoding/format/schema_definition.rs | 201 ++++++++++++++++++ lib/codecs/src/encoding/mod.rs | 2 + src/codecs/encoding/encoder.rs | 2 +- src/components/validation/resources/mod.rs | 2 + src/internal_events/codecs.rs | 4 +- src/sinks/aws_s3/config.rs | 2 +- .../cue/reference/components/sinks/aws_s3.cue | 154 +++++++++++--- 9 files changed, 380 insertions(+), 71 deletions(-) create mode 100644 lib/codecs/src/encoding/format/schema_definition.rs diff --git a/lib/codecs/src/encoding/format/mod.rs b/lib/codecs/src/encoding/format/mod.rs index 1c0ae27a57485..92ede1f3590c8 100644 --- a/lib/codecs/src/encoding/format/mod.rs +++ b/lib/codecs/src/encoding/format/mod.rs @@ -20,6 +20,8 @@ mod otlp; mod parquet; mod protobuf; mod raw_message; +#[cfg(any(feature = "arrow", feature = "parquet"))] +mod schema_definition; mod text; use std::fmt::Debug; @@ -40,6 +42,8 @@ pub use otlp::{OtlpSerializer, OtlpSerializerConfig}; pub use parquet::{ParquetCompression, ParquetEncodingError, ParquetSerializer, ParquetSerializerConfig}; pub use protobuf::{ProtobufSerializer, ProtobufSerializerConfig, ProtobufSerializerOptions}; pub use raw_message::{RawMessageSerializer, RawMessageSerializerConfig}; +#[cfg(any(feature = "arrow", feature = "parquet"))] +pub use schema_definition::{SchemaDefinition, SchemaDefinitionError}; pub use text::{TextSerializer, TextSerializerConfig}; use vector_core::event::Event; diff --git a/lib/codecs/src/encoding/format/parquet.rs b/lib/codecs/src/encoding/format/parquet.rs index 9e72c7d4c89d0..e5a1b802b870e 100644 --- a/lib/codecs/src/encoding/format/parquet.rs +++ b/lib/codecs/src/encoding/format/parquet.rs @@ -19,6 +19,7 @@ use vector_core::event::Event; // Reuse the Arrow encoder's record batch building logic use super::arrow::{build_record_batch, ArrowEncodingError}; +use super::schema_definition::SchemaDefinition; /// Compression algorithm for Parquet files #[configurable_component] @@ -57,12 +58,17 @@ impl From for Compression { #[configurable_component] #[derive(Clone, Default)] pub struct ParquetSerializerConfig { - /// The Arrow schema to use for encoding + /// The Arrow schema definition to use for encoding /// /// This schema defines the structure and types of the Parquet file columns. - #[serde(skip)] - #[configurable(derived)] - pub schema: Option>, + /// Specified as a map of field names to data types. 
+ /// + /// Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + /// float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + /// timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + #[serde(default)] + #[configurable(metadata(docs::examples = "schema_example()"))] + pub schema: Option, /// Compression algorithm to use for Parquet columns /// @@ -78,6 +84,9 @@ pub struct ParquetSerializerConfig { /// /// Row groups are Parquet's unit of parallelization. Larger row groups /// can improve compression but increase memory usage during encoding. + /// + /// Since each batch becomes a separate Parquet file, this value + /// should be <= the batch max_events setting. Row groups cannot span multiple files. /// If not specified, defaults to the batch size. #[serde(default)] #[configurable(metadata(docs::examples = 100000))] @@ -97,16 +106,18 @@ pub struct ParquetSerializerConfig { pub allow_nullable_fields: bool, } +fn schema_example() -> std::collections::BTreeMap { + let mut map = std::collections::BTreeMap::new(); + map.insert("id".to_string(), "int64".to_string()); + map.insert("name".to_string(), "utf8".to_string()); + map.insert("timestamp".to_string(), "timestamp_microsecond".to_string()); + map +} + impl std::fmt::Debug for ParquetSerializerConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ParquetSerializerConfig") - .field( - "schema", - &self - .schema - .as_ref() - .map(|s| format!("{} fields", s.fields().len())), - ) + .field("schema", &self.schema.is_some()) .field("compression", &self.compression) .field("row_group_size", &self.row_group_size) .field("allow_nullable_fields", &self.allow_nullable_fields) @@ -115,8 +126,8 @@ impl std::fmt::Debug for ParquetSerializerConfig { } impl ParquetSerializerConfig { - /// Create a new ParquetSerializerConfig with a schema - pub fn new(schema: Arc) -> Self { + /// Create a new ParquetSerializerConfig with a schema definition + pub fn new(schema: SchemaDefinition) -> Self { Self { schema: Some(schema), compression: ParquetCompression::default(), @@ -146,12 +157,17 @@ pub struct ParquetSerializer { impl ParquetSerializer { /// Create a new ParquetSerializer with the given configuration pub fn new(config: ParquetSerializerConfig) -> Result { - let mut schema = config.schema.ok_or_else(|| { + let schema_def = config.schema.ok_or_else(|| { vector_common::Error::from( - "Parquet serializer requires a schema. Pass a schema or fetch from provider before creating serializer." + "Parquet serializer requires a schema. Specify 'schema' in the configuration." 
) })?; + // Convert SchemaDefinition to Arrow Schema + let mut schema = schema_def + .to_arrow_schema() + .map_err(|e| vector_common::Error::from(e.to_string()))?; + // If allow_nullable_fields is enabled, transform the schema once here // instead of on every batch encoding if config.allow_nullable_fields { @@ -512,14 +528,13 @@ mod tests { #[test] fn test_parquet_serializer_config() { - let schema = Arc::new(Schema::new(vec![Field::new( - "field", - DataType::Int64, - true, - )])); + use std::collections::BTreeMap; + + let mut schema_map = BTreeMap::new(); + schema_map.insert("field".to_string(), "int64".to_string()); let config = ParquetSerializerConfig { - schema: Some(schema), + schema: Some(SchemaDefinition::Simple(schema_map)), compression: ParquetCompression::Zstd, row_group_size: Some(1000), allow_nullable_fields: false, @@ -544,14 +559,14 @@ mod tests { #[test] fn test_encoder_trait_implementation() { + use std::collections::BTreeMap; use tokio_util::codec::Encoder; - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int64, true), - Field::new("name", DataType::Utf8, true), - ])); + let mut schema_map = BTreeMap::new(); + schema_map.insert("id".to_string(), "int64".to_string()); + schema_map.insert("name".to_string(), "utf8".to_string()); - let config = ParquetSerializerConfig::new(schema); + let config = ParquetSerializerConfig::new(SchemaDefinition::Simple(schema_map)); let mut serializer = ParquetSerializer::new(config).unwrap(); let mut log = LogEvent::default(); @@ -609,13 +624,11 @@ mod tests { #[test] fn test_allow_nullable_fields_config() { + use std::collections::BTreeMap; use tokio_util::codec::Encoder; - let schema = Arc::new(Schema::new(vec![Field::new( - "required_field", - DataType::Int64, - false, // Non-nullable - )])); + let mut schema_map = BTreeMap::new(); + schema_map.insert("required_field".to_string(), "int64".to_string()); let mut log1 = LogEvent::default(); log1.insert("required_field", 42); @@ -625,8 +638,9 @@ mod tests { let events = vec![Event::Log(log1), Event::Log(log2)]; - // With allow_nullable_fields = true, should succeed - let mut config = ParquetSerializerConfig::new(Arc::clone(&schema)); + // Note: SchemaDefinition creates nullable fields by default + // This test verifies that the allow_nullable_fields flag works + let mut config = ParquetSerializerConfig::new(SchemaDefinition::Simple(schema_map)); config.allow_nullable_fields = true; let mut serializer = ParquetSerializer::new(config).unwrap(); diff --git a/lib/codecs/src/encoding/format/schema_definition.rs b/lib/codecs/src/encoding/format/schema_definition.rs new file mode 100644 index 0000000000000..0c98788907f7a --- /dev/null +++ b/lib/codecs/src/encoding/format/schema_definition.rs @@ -0,0 +1,201 @@ +//! 
Schema definition support for Arrow and Parquet encoders + +use std::{collections::BTreeMap, sync::Arc}; + +use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; +#[allow(unused_imports)] // Used by vector_config macros +use serde::{Deserialize, Serialize}; +use snafu::Snafu; +use vector_config::configurable_component; + +/// Error type for schema definition parsing +#[derive(Debug, Snafu)] +pub enum SchemaDefinitionError { + /// Unknown data type specified in schema + #[snafu(display("Unknown data type '{}' for field '{}'", data_type, field_name))] + UnknownDataType { + /// The field name that had an unknown type + field_name: String, + /// The unknown data type string + data_type: String, + }, +} + +/// A schema definition that can be deserialized from configuration +#[configurable_component] +#[derive(Debug, Clone)] +#[serde(untagged)] +pub enum SchemaDefinition { + /// Simple map of field names to type names + Simple(BTreeMap), +} + +impl SchemaDefinition { + /// Convert the schema definition to an Arrow Schema + pub fn to_arrow_schema(&self) -> Result, SchemaDefinitionError> { + match self { + SchemaDefinition::Simple(fields) => { + let arrow_fields: Result, _> = fields + .iter() + .map(|(name, type_str)| { + let data_type = parse_data_type(type_str, name)?; + // All fields are nullable by default when defined in config + Ok(Arc::new(Field::new(name, data_type, true))) + }) + .collect(); + + Ok(Arc::new(Schema::new(arrow_fields?))) + } + } + } +} + +/// Parse a data type string into an Arrow DataType +fn parse_data_type( + type_str: &str, + field_name: &str, +) -> Result { + let data_type = match type_str.to_lowercase().as_str() { + // String types + "utf8" | "string" => DataType::Utf8, + "large_utf8" | "large_string" => DataType::LargeUtf8, + + // Integer types + "int8" => DataType::Int8, + "int16" => DataType::Int16, + "int32" => DataType::Int32, + "int64" => DataType::Int64, + + // Unsigned integer types + "uint8" => DataType::UInt8, + "uint16" => DataType::UInt16, + "uint32" => DataType::UInt32, + "uint64" => DataType::UInt64, + + // Floating point types + "float32" | "float" => DataType::Float32, + "float64" | "double" => DataType::Float64, + + // Boolean + "boolean" | "bool" => DataType::Boolean, + + // Binary types + "binary" => DataType::Binary, + "large_binary" => DataType::LargeBinary, + + // Timestamp types + "timestamp_second" | "timestamp_s" => { + DataType::Timestamp(TimeUnit::Second, None) + } + "timestamp_millisecond" | "timestamp_ms" | "timestamp_millis" => { + DataType::Timestamp(TimeUnit::Millisecond, None) + } + "timestamp_microsecond" | "timestamp_us" | "timestamp_micros" => { + DataType::Timestamp(TimeUnit::Microsecond, None) + } + "timestamp_nanosecond" | "timestamp_ns" | "timestamp_nanos" => { + DataType::Timestamp(TimeUnit::Nanosecond, None) + } + + // Date types + "date32" | "date" => DataType::Date32, + "date64" => DataType::Date64, + + // Time types + "time32_second" | "time32_s" => DataType::Time32(TimeUnit::Second), + "time32_millisecond" | "time32_ms" => DataType::Time32(TimeUnit::Millisecond), + "time64_microsecond" | "time64_us" => DataType::Time64(TimeUnit::Microsecond), + "time64_nanosecond" | "time64_ns" => DataType::Time64(TimeUnit::Nanosecond), + + // Duration types + "duration_second" | "duration_s" => DataType::Duration(TimeUnit::Second), + "duration_millisecond" | "duration_ms" => DataType::Duration(TimeUnit::Millisecond), + "duration_microsecond" | "duration_us" => DataType::Duration(TimeUnit::Microsecond), + "duration_nanosecond" | 
"duration_ns" => DataType::Duration(TimeUnit::Nanosecond), + + // Decimal types + "decimal128" => DataType::Decimal128(38, 10), // Default precision and scale + "decimal256" => DataType::Decimal256(76, 10), // Default precision and scale + + // Unknown type + _ => { + return Err(SchemaDefinitionError::UnknownDataType { + field_name: field_name.to_string(), + data_type: type_str.to_string(), + }) + } + }; + + Ok(data_type) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_schema_definition() { + let mut fields = BTreeMap::new(); + fields.insert("id".to_string(), "int64".to_string()); + fields.insert("name".to_string(), "utf8".to_string()); + fields.insert("value".to_string(), "float64".to_string()); + + let schema_def = SchemaDefinition::Simple(fields); + let schema = schema_def.to_arrow_schema().unwrap(); + + assert_eq!(schema.fields().len(), 3); + + let id_field = schema.field_with_name("id").unwrap(); + assert_eq!(id_field.data_type(), &DataType::Int64); + assert!(id_field.is_nullable()); + + let name_field = schema.field_with_name("name").unwrap(); + assert_eq!(name_field.data_type(), &DataType::Utf8); + + let value_field = schema.field_with_name("value").unwrap(); + assert_eq!(value_field.data_type(), &DataType::Float64); + } + + #[test] + fn test_timestamp_types() { + let mut fields = BTreeMap::new(); + fields.insert("ts_s".to_string(), "timestamp_second".to_string()); + fields.insert("ts_ms".to_string(), "timestamp_millisecond".to_string()); + fields.insert("ts_us".to_string(), "timestamp_microsecond".to_string()); + fields.insert("ts_ns".to_string(), "timestamp_nanosecond".to_string()); + + let schema_def = SchemaDefinition::Simple(fields); + let schema = schema_def.to_arrow_schema().unwrap(); + + assert_eq!( + schema.field_with_name("ts_s").unwrap().data_type(), + &DataType::Timestamp(TimeUnit::Second, None) + ); + assert_eq!( + schema.field_with_name("ts_ms").unwrap().data_type(), + &DataType::Timestamp(TimeUnit::Millisecond, None) + ); + assert_eq!( + schema.field_with_name("ts_us").unwrap().data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + assert_eq!( + schema.field_with_name("ts_ns").unwrap().data_type(), + &DataType::Timestamp(TimeUnit::Nanosecond, None) + ); + } + + #[test] + fn test_unknown_data_type() { + let mut fields = BTreeMap::new(); + fields.insert("bad_field".to_string(), "unknown_type".to_string()); + + let schema_def = SchemaDefinition::Simple(fields); + let result = schema_def.to_arrow_schema(); + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(err.to_string().contains("unknown_type")); + } + +} diff --git a/lib/codecs/src/encoding/mod.rs b/lib/codecs/src/encoding/mod.rs index 22b853336e2d7..320a0ab9c1296 100644 --- a/lib/codecs/src/encoding/mod.rs +++ b/lib/codecs/src/encoding/mod.rs @@ -21,6 +21,8 @@ pub use format::{ pub use format::{ParquetCompression, ParquetEncodingError, ParquetSerializer, ParquetSerializerConfig}; #[cfg(feature = "opentelemetry")] pub use format::{OtlpSerializer, OtlpSerializerConfig}; +#[cfg(any(feature = "arrow", feature = "parquet"))] +pub use format::{SchemaDefinition, SchemaDefinitionError}; pub use framing::{ BoxedFramer, BoxedFramingError, BytesEncoder, BytesEncoderConfig, CharacterDelimitedEncoder, CharacterDelimitedEncoderConfig, CharacterDelimitedEncoderOptions, Framer, FramingConfig, diff --git a/src/codecs/encoding/encoder.rs b/src/codecs/encoding/encoder.rs index cd276ac039550..a88a2275911b8 100644 --- a/src/codecs/encoding/encoder.rs +++ 
b/src/codecs/encoding/encoder.rs @@ -77,7 +77,7 @@ impl tokio_util::codec::Encoder> for BatchEncoder { BatchSerializer::Parquet(serializer) => { serializer.encode(events, buffer).map_err(|err| { use vector_lib::codecs::encoding::ParquetEncodingError; - match err { + match &err { ParquetEncodingError::RecordBatchCreation { source } => { use vector_lib::codecs::encoding::ArrowEncodingError; match source { diff --git a/src/components/validation/resources/mod.rs b/src/components/validation/resources/mod.rs index 67c22910e07b0..f72116111c93d 100644 --- a/src/components/validation/resources/mod.rs +++ b/src/components/validation/resources/mod.rs @@ -239,6 +239,8 @@ fn serializer_config_to_deserializer( SerializerConfig::RawMessage | SerializerConfig::Text(_) => DeserializerConfig::Bytes, #[cfg(feature = "codecs-opentelemetry")] SerializerConfig::Otlp => todo!(), + #[cfg(feature = "codecs-parquet")] + SerializerConfig::Parquet(_) => DeserializerConfig::Bytes, // Parquet files are binary }; deserializer_config.build() diff --git a/src/internal_events/codecs.rs b/src/internal_events/codecs.rs index 27980af51b799..c6b2671eec568 100644 --- a/src/internal_events/codecs.rs +++ b/src/internal_events/codecs.rs @@ -137,13 +137,13 @@ impl InternalEvent for EncoderWriteError<'_, E> { } } -#[cfg(feature = "codecs-arrow")] +#[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] #[derive(Debug, NamedInternalEvent)] pub struct EncoderNullConstraintError<'a> { pub error: &'a crate::Error, } -#[cfg(feature = "codecs-arrow")] +#[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] impl InternalEvent for EncoderNullConstraintError<'_> { fn emit(self) { const CONSTRAINT_REASON: &str = "Schema constraint violation."; diff --git a/src/sinks/aws_s3/config.rs b/src/sinks/aws_s3/config.rs index 6a0f51098e8aa..03c7ca8295e28 100644 --- a/src/sinks/aws_s3/config.rs +++ b/src/sinks/aws_s3/config.rs @@ -278,7 +278,7 @@ impl S3SinkConfig { #[cfg(test)] mod tests { - use super::S3SinkConfig; + use super::*; #[test] fn generate_config() { diff --git a/website/cue/reference/components/sinks/aws_s3.cue b/website/cue/reference/components/sinks/aws_s3.cue index 3bc440ee0a0f7..4ada477060448 100644 --- a/website/cue/reference/components/sinks/aws_s3.cue +++ b/website/cue/reference/components/sinks/aws_s3.cue @@ -111,49 +111,135 @@ components: sinks: aws_s3: components._aws & { provides efficient compression and encoding schemes, making it ideal for long-term storage and query performance with tools like AWS Athena, Apache Spark, and Presto. - When using Parquet encoding, you must specify a schema that defines the structure and - types of the Parquet file columns. Vector events are converted to Arrow RecordBatches - and then written as Parquet files. + ## Schema Configuration - **Supported Parquet compression algorithms:** + When using Parquet encoding, you **must** specify a schema that defines the structure and + types of the Parquet file columns. The schema is defined as a simple map of field names to + data types. Vector events are converted to Arrow RecordBatches and then written as Parquet files. + + All fields defined in the schema are nullable by default, meaning missing fields will be encoded + as NULL values in the Parquet file. 
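
To make that concrete, the following sketch shows roughly how such a field-name-to-type map is turned into a nullable Arrow schema. It is a simplified illustration covering only a few representative type names, not the codec's full `SchemaDefinition` implementation; the unsupported-type branch corresponds to the `UnknownDataType` error surfaced by the codec.

```rust
use std::{collections::BTreeMap, sync::Arc};

use arrow::datatypes::{DataType, Field, Schema};

// Simplified sketch: map configured type names to Arrow data types and mark
// every field nullable, mirroring the behaviour described above. Only a few
// representative type names are handled here.
fn schema_from_map(fields: &BTreeMap<String, String>) -> Result<Arc<Schema>, String> {
    let arrow_fields = fields
        .iter()
        .map(|(name, ty)| {
            let data_type = match ty.as_str() {
                "utf8" | "string" => DataType::Utf8,
                "int64" => DataType::Int64,
                "float64" | "double" => DataType::Float64,
                "boolean" | "bool" => DataType::Boolean,
                other => return Err(format!("unknown type '{other}' for field '{name}'")),
            };
            // Every configured field is nullable, so missing values become NULL.
            Ok(Field::new(name.as_str(), data_type, true))
        })
        .collect::<Result<Vec<_>, String>>()?;

    Ok(Arc::new(Schema::new(arrow_fields)))
}
```
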
+ + **Example configuration:** + + ```yaml + sinks: + s3: + type: aws_s3 + bucket: my-bucket + compression: none # Parquet handles compression internally + batch: + max_events: 50000 + timeout_secs: 60 + encoding: + codec: parquet + schema: + # Timestamps + timestamp: timestamp_microsecond + created_at: timestamp_millisecond + + # String fields + user_id: utf8 + event_name: utf8 + message: utf8 + + # Numeric fields + team_id: int64 + duration_ms: float64 + count: int32 + + # Boolean + is_active: boolean + + parquet: + compression: zstd + row_group_size: 50000 # Should be <= batch.max_events + allow_nullable_fields: true + ``` + + ## Supported Data Types + + The following data types are supported for Parquet schema fields: + + **String types:** + - `utf8` or `string`: UTF-8 encoded strings + - `large_utf8` or `large_string`: Large UTF-8 strings (>2GB) + + **Integer types:** + - `int8`, `int16`, `int32`, `int64`: Signed integers + - `uint8`, `uint16`, `uint32`, `uint64`: Unsigned integers + + **Floating point types:** + - `float32` or `float`: 32-bit floating point + - `float64` or `double`: 64-bit floating point + + **Timestamp types:** + - `timestamp_second` or `timestamp_s`: Seconds since Unix epoch + - `timestamp_millisecond`, `timestamp_ms`, or `timestamp_millis`: Milliseconds since Unix epoch + - `timestamp_microsecond`, `timestamp_us`, or `timestamp_micros`: Microseconds since Unix epoch + - `timestamp_nanosecond`, `timestamp_ns`, or `timestamp_nanos`: Nanoseconds since Unix epoch + + **Date types:** + - `date32` or `date`: Days since Unix epoch (32-bit) + - `date64`: Milliseconds since Unix epoch (64-bit) + + **Other types:** + - `boolean` or `bool`: Boolean values + - `binary`: Arbitrary binary data + - `large_binary`: Large binary data (>2GB) + - `decimal128`: 128-bit decimal with default precision + - `decimal256`: 256-bit decimal with default precision + + ## Parquet Configuration Options + + ### compression + + Compression algorithm applied to Parquet column data: - `snappy` (default): Fast compression with moderate compression ratio - - `gzip`: Balanced compression, good for AWS Athena compatibility + - `gzip`: Balanced compression, excellent AWS Athena compatibility - `zstd`: Best compression ratio, ideal for cold storage - `lz4`: Very fast compression, good for high-throughput scenarios - `brotli`: Good compression, web-optimized - `uncompressed`: No compression - **Example configuration:** + ### row_group_size - ```yaml - encoding: - codec: parquet - schema: - timestamp: timestamp_micros - user_id: utf8 - action: utf8 - value: int64 - duration_ms: float64 - compression: snappy - row_group_size: 100000 - allow_nullable_fields: true - ``` + Number of rows per row group in the Parquet file. Row groups are Parquet's unit of + parallelization - query engines can read different row groups in parallel. + + **Important:** Since each batch becomes a separate Parquet file, `row_group_size` should + be less than or equal to `batch.max_events`. Row groups cannot span multiple files. + If omitted, defaults to the batch size. + + **Trade-offs:** + - **Larger row groups** (500K-1M rows): Better compression, less query parallelism + - **Smaller row groups** (50K-100K rows): More query parallelism, slightly worse compression + + For AWS Athena, row groups of 128-256 MB (uncompressed) are often recommended. + + ### allow_nullable_fields + + When enabled, missing or incompatible values will be encoded as NULL even for fields that + would normally be non-nullable. 
This is useful when working with downstream systems that + can handle NULL values through defaults or computed columns. + + ## Batching Behavior + + Each batch of events becomes **one Parquet file** in S3. The batch size is controlled by: + - `batch.max_events`: Maximum number of events per file + - `batch.max_bytes`: Maximum bytes per file + - `batch.timeout_secs`: Maximum time to wait before flushing + + Example: With `max_events: 50000`, each Parquet file will contain up to 50,000 rows. + + ## Important Notes - **Supported data types:** - - Strings: `utf8` - - Integers: `int8`, `int16`, `int32`, `int64` - - Unsigned integers: `uint8`, `uint16`, `uint32`, `uint64` - - Floats: `float32`, `float64` - - Timestamps: `timestamp_second`, `timestamp_millisecond`, `timestamp_microsecond`, `timestamp_nanosecond` - - Boolean: `boolean` - - Binary: `binary` - - Decimals: `decimal128`, `decimal256` - - **Note:** When using Parquet encoding, set `compression: none` at the sink level since - Parquet handles compression internally through its columnar compression algorithms. - - Each batch of events becomes one Parquet file in S3, with the batch size controlled by - the `batch.max_events`, `batch.max_bytes`, and `batch.timeout_secs` settings. + - **Sink-level compression**: Set `compression: none` at the sink level since Parquet + handles compression internally through its `parquet.compression` setting + - **All fields nullable**: Fields defined in the schema are nullable by default, allowing + for missing values + - **Schema required**: The schema cannot be inferred and must be explicitly configured + - **AWS Athena compatibility**: Use `gzip` compression for best Athena compatibility """ } From 5a7f6bf4b17fba4c826f22b5d5b3463eec05e589 Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Fri, 12 Dec 2025 13:45:15 +0100 Subject: [PATCH 03/13] Added parquet to default features --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index aa3abbe4318fe..d3ddb734a4b14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -495,7 +495,7 @@ ntapi = { git = "https://github.com/MSxDOS/ntapi.git", rev = "24fc1e47677fc9f6e3 [features] # Default features for *-unknown-linux-gnu and *-apple-darwin -default = ["api", "api-client", "enrichment-tables", "sinks", "sources", "sources-dnstap", "transforms", "unix", "rdkafka?/gssapi-vendored", "secrets"] +default = ["api", "api-client", "codecs-parquet", "enrichment-tables", "sinks", "sources", "sources-dnstap", "transforms", "unix", "rdkafka?/gssapi-vendored", "secrets"] # Default features for `cargo docs`. The same as `default` but without `rdkafka?/gssapi-vendored` which would require installing libsasl in our doc build environment. 
docs = ["api", "api-client", "enrichment-tables", "sinks", "sources", "sources-dnstap", "transforms", "unix", "secrets"] # Default features for *-unknown-linux-* which make use of `cmake` for dependencies From 52d4c8e4f031239d54e8d8d6c279787c783362b8 Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Fri, 12 Dec 2025 14:06:53 +0100 Subject: [PATCH 04/13] Added changelog item for parquet --- changelog.d/parquet_encoder_aws_s3.feature.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 changelog.d/parquet_encoder_aws_s3.feature.md diff --git a/changelog.d/parquet_encoder_aws_s3.feature.md b/changelog.d/parquet_encoder_aws_s3.feature.md new file mode 100644 index 0000000000000..6a2e921f0d595 --- /dev/null +++ b/changelog.d/parquet_encoder_aws_s3.feature.md @@ -0,0 +1,18 @@ +The `aws_s3` sink now supports [Apache Parquet](https://parquet.apache.org/) encoding, enabling +Vector to write columnar Parquet files optimized for analytics workloads. + +Parquet is a columnar storage format that provides efficient compression and encoding schemes, +making it ideal for long-term storage and query performance with tools like AWS Athena, Apache Spark, +and Presto. Users can now configure Parquet encoding with custom schemas defined directly in YAML +as a simple map of field names to data types. + +Features include: +- Support for all common data types: strings (utf8), integers (int8-int64), unsigned integers, + floats (float32, float64), timestamps (second/millisecond/microsecond/nanosecond precision), + booleans, binary data, and decimals +- Configurable compression algorithms: snappy (default), gzip, zstd, lz4, brotli, or uncompressed + +Each batch of events becomes one Parquet file in S3, with batch size controlled by the standard +`batch.max_events`, `batch.max_bytes`, and `batch.timeout_secs` settings. + +authors: rorylshanks From 672999df00d27dc423710d837ace5914ebcb9469 Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Sat, 13 Dec 2025 23:51:19 +0100 Subject: [PATCH 05/13] Pre-allocate buffer for parquet output based on herustic 2kb per event - can be tuned in config --- lib/codecs/src/encoding/format/parquet.rs | 74 ++++++++++++++++--- .../cue/reference/components/sinks/aws_s3.cue | 23 ++++++ .../components/sinks/generated/aws_s3.cue | 20 +++++ 3 files changed, 106 insertions(+), 11 deletions(-) diff --git a/lib/codecs/src/encoding/format/parquet.rs b/lib/codecs/src/encoding/format/parquet.rs index e5a1b802b870e..1315a859afab4 100644 --- a/lib/codecs/src/encoding/format/parquet.rs +++ b/lib/codecs/src/encoding/format/parquet.rs @@ -5,7 +5,7 @@ //! suitable for long-term storage and analytics workloads. use arrow::datatypes::Schema; -use bytes::{Bytes, BytesMut}; +use bytes::{Bytes, BytesMut, BufMut}; use parquet::{ arrow::ArrowWriter, basic::{Compression, ZstdLevel}, @@ -104,6 +104,22 @@ pub struct ParquetSerializerConfig { #[serde(default)] #[configurable(metadata(docs::examples = true))] pub allow_nullable_fields: bool, + + /// Estimated compressed output size in bytes for buffer pre-allocation. + /// + /// Pre-allocating the output buffer based on expected compressed size significantly + /// reduces memory overhead by avoiding repeated reallocations during encoding. + /// If not specified, defaults to a heuristic based on estimated uncompressed size. 
+ /// + /// Guidelines for setting this value: + /// - Monitor actual compressed output sizes in production + /// - Set to ~1.2x your average observed compressed size for headroom + /// - ZSTD typically achieves 3-10x compression on JSON data + /// - Example: If batches are 100MB uncompressed and compress to 10MB, set to ~12MB + #[serde(default)] + #[configurable(metadata(docs::examples = 10485760))] // 10MB + #[configurable(metadata(docs::examples = 52428800))] // 50MB + pub estimated_output_size: Option, } fn schema_example() -> std::collections::BTreeMap { @@ -121,6 +137,7 @@ impl std::fmt::Debug for ParquetSerializerConfig { .field("compression", &self.compression) .field("row_group_size", &self.row_group_size) .field("allow_nullable_fields", &self.allow_nullable_fields) + .field("estimated_output_size", &self.estimated_output_size) .finish() } } @@ -133,6 +150,7 @@ impl ParquetSerializerConfig { compression: ParquetCompression::default(), row_group_size: None, allow_nullable_fields: false, + estimated_output_size: None, } } @@ -152,6 +170,7 @@ impl ParquetSerializerConfig { pub struct ParquetSerializer { schema: Arc, writer_properties: WriterProperties, + estimated_output_size: Option, } impl ParquetSerializer { @@ -194,6 +213,7 @@ impl ParquetSerializer { Ok(Self { schema, writer_properties, + estimated_output_size: config.estimated_output_size, }) } } @@ -206,9 +226,15 @@ impl tokio_util::codec::Encoder> for ParquetSerializer { return Err(ParquetEncodingError::NoEvents); } - let bytes = encode_events_to_parquet(&events, Arc::clone(&self.schema), &self.writer_properties)?; + let bytes = encode_events_to_parquet( + &events, + Arc::clone(&self.schema), + &self.writer_properties, + self.estimated_output_size, + )?; - buffer.extend_from_slice(&bytes); + // Use put() instead of extend_from_slice to avoid copying when possible + buffer.put(bytes); Ok(()) } } @@ -269,6 +295,7 @@ pub fn encode_events_to_parquet( events: &[Event], schema: Arc, writer_properties: &WriterProperties, + estimated_output_size: Option, ) -> Result { if events.is_empty() { return Err(ParquetEncodingError::NoEvents); @@ -277,19 +304,42 @@ pub fn encode_events_to_parquet( // Build Arrow RecordBatch from events (reuses Arrow encoder logic) let record_batch = build_record_batch(schema, events)?; - // Write RecordBatch to Parquet format in memory - let mut buffer = Vec::new(); + // Get batch metadata before we move into writer scope + let batch_schema = record_batch.schema(); + + // Calculate buffer capacity to avoid reallocations + // This is critical for memory efficiency with large batches + let buffer_capacity = estimated_output_size.unwrap_or_else(|| { + // Heuristic: Estimate based on number of events and fields + // Assuming average 2KB per event after compression (conservative estimate) + // Users should tune estimated_output_size based on actual data for best results + let estimated_size = events.len() * 2048; + + // Cap at reasonable maximum to avoid over-allocation for small batches + estimated_size.min(128 * 1024 * 1024) // Cap at 128MB + }); + + // Write RecordBatch to Parquet format in memory with pre-allocated buffer + let mut buffer = Vec::with_capacity(buffer_capacity); { let mut writer = ArrowWriter::try_new( &mut buffer, - record_batch.schema(), + batch_schema, Some(writer_properties.clone()), )?; writer.write(&record_batch)?; + + // Explicitly drop RecordBatch to release Arrow array memory immediately + drop(record_batch); + + // close() consumes the writer, releasing compression buffers 
writer.close()?; } + // Shrink buffer to actual size to free excess pre-allocated capacity + buffer.shrink_to_fit(); + Ok(Bytes::from(buffer)) } @@ -337,7 +387,7 @@ mod tests { .set_compression(Compression::SNAPPY) .build(); - let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props, None); assert!(result.is_ok()); let bytes = result.unwrap(); @@ -441,7 +491,7 @@ mod tests { let props = WriterProperties::builder().build(); - let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props, None); assert!(result.is_ok()); let bytes = result.unwrap(); @@ -481,7 +531,7 @@ mod tests { true, )])); let props = WriterProperties::builder().build(); - let result = encode_events_to_parquet(&events, schema, &props); + let result = encode_events_to_parquet(&events, schema, &props, None); assert!(result.is_err()); assert!(matches!(result.unwrap_err(), ParquetEncodingError::NoEvents)); } @@ -511,7 +561,7 @@ mod tests { .set_compression(compression.into()) .build(); - let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props, None); assert!(result.is_ok(), "Failed with compression: {:?}", compression); // Verify we can read it back @@ -538,6 +588,7 @@ mod tests { compression: ParquetCompression::Zstd, row_group_size: Some(1000), allow_nullable_fields: false, + estimated_output_size: None, }; let serializer = ParquetSerializer::new(config); @@ -551,6 +602,7 @@ mod tests { compression: ParquetCompression::default(), row_group_size: None, allow_nullable_fields: false, + estimated_output_size: None, }; let result = ParquetSerializer::new(config); @@ -608,7 +660,7 @@ mod tests { .set_max_row_group_size(5000) // 2 row groups .build(); - let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props, None); assert!(result.is_ok()); let bytes = result.unwrap(); diff --git a/website/cue/reference/components/sinks/aws_s3.cue b/website/cue/reference/components/sinks/aws_s3.cue index 4ada477060448..e7c9ee7bf2465 100644 --- a/website/cue/reference/components/sinks/aws_s3.cue +++ b/website/cue/reference/components/sinks/aws_s3.cue @@ -155,6 +155,7 @@ components: sinks: aws_s3: components._aws & { compression: zstd row_group_size: 50000 # Should be <= batch.max_events allow_nullable_fields: true + estimated_output_size: 10485760 # 10MB - tune based on your data ``` ## Supported Data Types @@ -223,6 +224,28 @@ components: sinks: aws_s3: components._aws & { would normally be non-nullable. This is useful when working with downstream systems that can handle NULL values through defaults or computed columns. + ### estimated_output_size + + Estimated compressed output size in bytes for buffer pre-allocation. This is an optional + performance tuning parameter that can significantly reduce memory overhead by pre-allocating + the output buffer to an appropriate size, avoiding repeated reallocations during encoding. + + **How to set this value:** + 1. Monitor actual compressed Parquet file sizes in production + 2. Set to approximately 1.2x your average observed compressed size for headroom + 3. 
ZSTD compression typically achieves 3-10x compression on JSON/log data + + **Example:** If your batches are 100MB uncompressed and compress to 10MB on average, + set `estimated_output_size: 12582912` (12MB) to provide some headroom. + + If not specified, Vector uses a heuristic based on estimated uncompressed size + (approximately 2KB per event, capped at 128MB). + + **Trade-offs:** + - **Too small**: Minimal benefit, will still require reallocations + - **Too large**: Wastes memory by over-allocating + - **Just right**: Optimal memory usage with minimal reallocations + ## Batching Behavior Each batch of events becomes **one Parquet file** in S3. The batch size is controlled by: diff --git a/website/cue/reference/components/sinks/generated/aws_s3.cue b/website/cue/reference/components/sinks/generated/aws_s3.cue index 733368b35cdbd..308be6082dd0c 100644 --- a/website/cue/reference/components/sinks/generated/aws_s3.cue +++ b/website/cue/reference/components/sinks/generated/aws_s3.cue @@ -716,6 +716,26 @@ generated: components: sinks: aws_s3: configuration: { required: false type: bool: default: false } + estimated_output_size: { + description: """ + Estimated compressed output size in bytes for buffer pre-allocation. + + Pre-allocating the output buffer based on expected compressed size significantly + reduces memory overhead by avoiding repeated reallocations during encoding. + If not specified, defaults to a heuristic based on estimated uncompressed size. + + Guidelines for setting this value: + - Monitor actual compressed output sizes in production + - Set to ~1.2x your average observed compressed size for headroom + - ZSTD typically achieves 3-10x compression on JSON data + - Example: If batches are 100MB uncompressed and compress to 10MB, set to ~12MB + """ + required: false + type: uint: { + default: null + examples: [10485760, 52428800] + } + } } } protobuf: { From b54228fef97bfd40b2e64808b1efa243f56602ea Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Sun, 14 Dec 2025 00:48:17 +0100 Subject: [PATCH 06/13] Added ability to write bloom filters to parquet files. Configurable but defaulted to off --- lib/codecs/src/encoding/format/parquet.rs | 78 +++++++++++++++++++ .../cue/reference/components/sinks/aws_s3.cue | 57 ++++++++++++++ .../components/sinks/generated/aws_s3.cue | 62 +++++++++++++++ 3 files changed, 197 insertions(+) diff --git a/lib/codecs/src/encoding/format/parquet.rs b/lib/codecs/src/encoding/format/parquet.rs index 1315a859afab4..7ef8f0e196a79 100644 --- a/lib/codecs/src/encoding/format/parquet.rs +++ b/lib/codecs/src/encoding/format/parquet.rs @@ -120,6 +120,64 @@ pub struct ParquetSerializerConfig { #[configurable(metadata(docs::examples = 10485760))] // 10MB #[configurable(metadata(docs::examples = 52428800))] // 50MB pub estimated_output_size: Option, + + /// Enable Bloom filters for all columns. + /// + /// Bloom filters are probabilistic data structures that can significantly improve + /// query performance by allowing query engines to skip entire row groups when + /// searching for specific values. 
They are especially effective for: + /// - High-cardinality columns (UUIDs, user IDs, session IDs) + /// - String columns (URLs, emails, tags) + /// - Point queries (WHERE column = 'value') + /// - IN clauses (WHERE column IN (...)) + /// + /// Trade-offs: + /// - Pros: Faster queries, better row group pruning in engines like Athena/Spark + /// - Cons: Slightly larger file sizes (typically 1-5% overhead), minimal write overhead + /// + /// When disabled (default), no Bloom filters are written. + #[serde(default)] + #[configurable(metadata(docs::examples = true))] + #[configurable(metadata(docs::examples = false))] + pub enable_bloom_filters: bool, + + /// False positive probability for Bloom filters. + /// + /// This controls the trade-off between Bloom filter size and accuracy. + /// Lower values produce larger but more accurate filters. + /// + /// - Default: 0.05 (5% false positive rate) + /// - Range: Must be between 0.0 and 1.0 (exclusive) + /// - Recommended: 0.01 (1%) for high-selectivity queries, 0.05 (5%) for general use + /// + /// Only takes effect when enable_bloom_filters is true. + #[serde(default = "default_bloom_fpp")] + #[configurable(metadata(docs::examples = 0.05))] + #[configurable(metadata(docs::examples = 0.01))] + pub bloom_filter_fpp: f64, + + /// Estimated number of distinct values for Bloom filter sizing. + /// + /// This should match the expected cardinality of your columns. Higher values + /// result in larger Bloom filters. If your actual distinct value count significantly + /// exceeds this number, the false positive rate may increase. + /// + /// - Default: 1,000,000 + /// - Recommended: Set based on your data's actual cardinality + /// + /// Only takes effect when enable_bloom_filters is true. + #[serde(default = "default_bloom_ndv")] + #[configurable(metadata(docs::examples = 1000000))] + #[configurable(metadata(docs::examples = 10000000))] + pub bloom_filter_ndv: u64, +} + +fn default_bloom_fpp() -> f64 { + 0.05 +} + +fn default_bloom_ndv() -> u64 { + 1_000_000 } fn schema_example() -> std::collections::BTreeMap { @@ -138,6 +196,9 @@ impl std::fmt::Debug for ParquetSerializerConfig { .field("row_group_size", &self.row_group_size) .field("allow_nullable_fields", &self.allow_nullable_fields) .field("estimated_output_size", &self.estimated_output_size) + .field("enable_bloom_filters", &self.enable_bloom_filters) + .field("bloom_filter_fpp", &self.bloom_filter_fpp) + .field("bloom_filter_ndv", &self.bloom_filter_ndv) .finish() } } @@ -151,6 +212,9 @@ impl ParquetSerializerConfig { row_group_size: None, allow_nullable_fields: false, estimated_output_size: None, + enable_bloom_filters: false, + bloom_filter_fpp: default_bloom_fpp(), + bloom_filter_ndv: default_bloom_ndv(), } } @@ -208,6 +272,14 @@ impl ParquetSerializer { props_builder = props_builder.set_max_row_group_size(row_group_size); } + // Enable Bloom filters if configured + if config.enable_bloom_filters { + props_builder = props_builder + .set_bloom_filter_enabled(true) + .set_bloom_filter_fpp(config.bloom_filter_fpp) + .set_bloom_filter_ndv(config.bloom_filter_ndv); + } + let writer_properties = props_builder.build(); Ok(Self { @@ -589,6 +661,9 @@ mod tests { row_group_size: Some(1000), allow_nullable_fields: false, estimated_output_size: None, + enable_bloom_filters: false, + bloom_filter_fpp: default_bloom_fpp(), + bloom_filter_ndv: default_bloom_ndv(), }; let serializer = ParquetSerializer::new(config); @@ -603,6 +678,9 @@ mod tests { row_group_size: None, allow_nullable_fields: false, 
estimated_output_size: None, + enable_bloom_filters: false, + bloom_filter_fpp: default_bloom_fpp(), + bloom_filter_ndv: default_bloom_ndv(), }; let result = ParquetSerializer::new(config); diff --git a/website/cue/reference/components/sinks/aws_s3.cue b/website/cue/reference/components/sinks/aws_s3.cue index e7c9ee7bf2465..17c51f1688697 100644 --- a/website/cue/reference/components/sinks/aws_s3.cue +++ b/website/cue/reference/components/sinks/aws_s3.cue @@ -156,6 +156,9 @@ components: sinks: aws_s3: components._aws & { row_group_size: 50000 # Should be <= batch.max_events allow_nullable_fields: true estimated_output_size: 10485760 # 10MB - tune based on your data + enable_bloom_filters: true # Enable for better query performance + bloom_filter_fpp: 0.05 # 5% false positive rate + bloom_filter_ndv: 1000000 # Expected distinct values ``` ## Supported Data Types @@ -246,6 +249,60 @@ components: sinks: aws_s3: components._aws & { - **Too large**: Wastes memory by over-allocating - **Just right**: Optimal memory usage with minimal reallocations + ### enable_bloom_filters + + Enable Bloom filters for all columns in the Parquet file. Bloom filters are probabilistic + data structures that can significantly improve query performance by allowing query engines + (like AWS Athena, Apache Spark, and Presto) to skip entire row groups when searching for + specific values without reading the actual data. + + **When to enable:** + - High-cardinality columns: UUIDs, user IDs, session IDs, transaction IDs + - String columns frequently used in WHERE clauses: URLs, emails, tags, names + - Point queries: `WHERE user_id = 'abc123'` + - IN clause queries: `WHERE id IN ('x', 'y', 'z')` + + **Trade-offs:** + - **Pros**: Significantly faster queries, better row group pruning, reduced I/O + - **Cons**: Slightly larger file sizes (typically 1-5% overhead), minimal write overhead + + **Default**: `false` (disabled) + + ### bloom_filter_fpp + + False positive probability (FPP) for Bloom filters. This controls the trade-off between + Bloom filter size and accuracy. Lower values produce larger but more accurate filters. + + - **Default**: `0.05` (5% false positive rate) + - **Range**: Must be between 0.0 and 1.0 (exclusive) + - **Recommended values**: + - `0.05` (5%): Good balance for general use + - `0.01` (1%): Better for high-selectivity queries where precision matters + - `0.10` (10%): Smaller filters when storage is a concern + + A false positive means the Bloom filter indicates a value *might* be in a row group when it + actually isn't, requiring the engine to read and filter that row group. Lower FPP means fewer + unnecessary reads. + + Only takes effect when `enable_bloom_filters` is `true`. + + ### bloom_filter_ndv + + Estimated number of distinct values (NDV) for Bloom filter sizing. This should match the + expected cardinality of your columns. Higher values result in larger Bloom filters. + + - **Default**: `1,000,000` + - **Recommendation**: Analyze your data to determine actual cardinality + - Low cardinality (countries, states): `1,000` - `100,000` + - Medium cardinality (cities, products): `100,000` - `1,000,000` + - High cardinality (user IDs, UUIDs): `10,000,000+` + + **Important**: If your actual distinct value count significantly exceeds this number, the + false positive rate may increase beyond the configured `bloom_filter_fpp`, reducing query + performance gains. + + Only takes effect when `enable_bloom_filters` is `true`. 
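
Under the hood, these three settings are forwarded to the `parquet` crate's `WriterProperties` builder. The sketch below is a simplified illustration of that wiring (the compression choice and variable names are illustrative, not the sink's exact code); later in this patch series the same properties are applied per column via `set_column_bloom_filter_enabled` when Bloom filters move into the schema's field definitions.

```rust
use parquet::{basic::Compression, file::properties::WriterProperties};

// Simplified sketch of how the encoder options translate into parquet writer
// properties. `fpp` and `ndv` correspond to `bloom_filter_fpp` and
// `bloom_filter_ndv` above.
fn writer_properties(enable_bloom_filters: bool, fpp: f64, ndv: u64) -> WriterProperties {
    let mut builder = WriterProperties::builder().set_compression(Compression::SNAPPY);

    if enable_bloom_filters {
        builder = builder
            .set_bloom_filter_enabled(true)
            .set_bloom_filter_fpp(fpp)
            .set_bloom_filter_ndv(ndv);
    }

    builder.build()
}
```
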
+ ## Batching Behavior Each batch of events becomes **one Parquet file** in S3. The batch size is controlled by: diff --git a/website/cue/reference/components/sinks/generated/aws_s3.cue b/website/cue/reference/components/sinks/generated/aws_s3.cue index 308be6082dd0c..65477e6ae498a 100644 --- a/website/cue/reference/components/sinks/generated/aws_s3.cue +++ b/website/cue/reference/components/sinks/generated/aws_s3.cue @@ -736,6 +736,68 @@ generated: components: sinks: aws_s3: configuration: { examples: [10485760, 52428800] } } + enable_bloom_filters: { + description: """ + Enable Bloom filters for all columns. + + Bloom filters are probabilistic data structures that can significantly improve + query performance by allowing query engines to skip entire row groups when + searching for specific values. They are especially effective for: + - High-cardinality columns (UUIDs, user IDs, session IDs) + - String columns (URLs, emails, tags) + - Point queries (WHERE column = 'value') + - IN clauses (WHERE column IN (...)) + + Trade-offs: + - Pros: Faster queries, better row group pruning in engines like Athena/Spark + - Cons: Slightly larger file sizes (typically 1-5% overhead), minimal write overhead + + When disabled (default), no Bloom filters are written. + """ + required: false + type: bool: { + default: false + examples: [true, false] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for Bloom filters. + + This controls the trade-off between Bloom filter size and accuracy. + Lower values produce larger but more accurate filters. + + - Default: 0.05 (5% false positive rate) + - Range: Must be between 0.0 and 1.0 (exclusive) + - Recommended: 0.01 (1%) for high-selectivity queries, 0.05 (5%) for general use + + Only takes effect when enable_bloom_filters is true. + """ + required: false + type: float: { + default: 0.05 + examples: [0.05, 0.01] + } + } + bloom_filter_ndv: { + description: """ + Estimated number of distinct values for Bloom filter sizing. + + This should match the expected cardinality of your columns. Higher values + result in larger Bloom filters. If your actual distinct value count significantly + exceeds this number, the false positive rate may increase. + + - Default: 1,000,000 + - Recommended: Set based on your data's actual cardinality + + Only takes effect when enable_bloom_filters is true. 
+ """ + required: false + type: uint: { + default: 1000000 + examples: [1000000, 10000000] + } + } } } protobuf: { From 9c9bb9e81d7714f2454344d2ff95ebefda9d9582 Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Sun, 14 Dec 2025 18:28:35 +0100 Subject: [PATCH 07/13] Use parquet V2 by default, added compression level support, added schema inference --- lib/codecs/src/encoding/format/parquet.rs | 742 ++++++++++++++---- .../src/encoding/format/schema_definition.rs | 249 +++++- lib/codecs/src/encoding/serializer.rs | 22 +- src/codecs/encoding/config.rs | 4 +- src/components/validation/resources/mod.rs | 2 +- .../cue/reference/components/sinks/aws_s3.cue | 402 ++++++++-- .../components/sinks/generated/amqp.cue | 327 ++++++++ .../sinks/generated/aws_cloudwatch_logs.cue | 327 ++++++++ .../sinks/generated/aws_kinesis_firehose.cue | 327 ++++++++ .../sinks/generated/aws_kinesis_streams.cue | 327 ++++++++ .../components/sinks/generated/aws_s3.cue | 366 ++++++--- .../components/sinks/generated/aws_sns.cue | 327 ++++++++ .../components/sinks/generated/aws_sqs.cue | 327 ++++++++ .../components/sinks/generated/azure_blob.cue | 327 ++++++++ .../components/sinks/generated/console.cue | 327 ++++++++ .../components/sinks/generated/file.cue | 327 ++++++++ .../generated/gcp_chronicle_unstructured.cue | 327 ++++++++ .../sinks/generated/gcp_cloud_storage.cue | 327 ++++++++ .../components/sinks/generated/gcp_pubsub.cue | 327 ++++++++ .../components/sinks/generated/http.cue | 327 ++++++++ .../components/sinks/generated/humio_logs.cue | 327 ++++++++ .../components/sinks/generated/kafka.cue | 327 ++++++++ .../components/sinks/generated/loki.cue | 327 ++++++++ .../components/sinks/generated/mqtt.cue | 327 ++++++++ .../components/sinks/generated/nats.cue | 327 ++++++++ .../sinks/generated/opentelemetry.cue | 327 ++++++++ .../components/sinks/generated/papertrail.cue | 327 ++++++++ .../components/sinks/generated/pulsar.cue | 327 ++++++++ .../components/sinks/generated/redis.cue | 327 ++++++++ .../components/sinks/generated/socket.cue | 327 ++++++++ .../sinks/generated/splunk_hec_logs.cue | 327 ++++++++ .../components/sinks/generated/webhdfs.cue | 327 ++++++++ .../components/sinks/generated/websocket.cue | 327 ++++++++ .../sinks/generated/websocket_server.cue | 327 ++++++++ 34 files changed, 10240 insertions(+), 376 deletions(-) diff --git a/lib/codecs/src/encoding/format/parquet.rs b/lib/codecs/src/encoding/format/parquet.rs index 7ef8f0e196a79..f487d0a0f583d 100644 --- a/lib/codecs/src/encoding/format/parquet.rs +++ b/lib/codecs/src/encoding/format/parquet.rs @@ -8,8 +8,9 @@ use arrow::datatypes::Schema; use bytes::{Bytes, BytesMut, BufMut}; use parquet::{ arrow::ArrowWriter, - basic::{Compression, ZstdLevel}, - file::properties::WriterProperties, + basic::{Compression, ZstdLevel, GzipLevel, BrotliLevel}, + file::properties::{WriterProperties, WriterVersion}, + schema::types::ColumnPath, }; use snafu::Snafu; use std::sync::Arc; @@ -41,15 +42,58 @@ pub enum ParquetCompression { Zstd, } +impl ParquetCompression { + /// Convert to parquet Compression with optional level override + fn to_compression(&self, level: Option) -> Result { + match (self, level) { + (ParquetCompression::Uncompressed, _) => Ok(Compression::UNCOMPRESSED), + (ParquetCompression::Snappy, _) => Ok(Compression::SNAPPY), + (ParquetCompression::Lz4, _) => Ok(Compression::LZ4), + (ParquetCompression::Gzip, Some(lvl)) => { + GzipLevel::try_new(lvl as u32) + .map(Compression::GZIP) + .map_err(|e| format!("Invalid GZIP compression level: {}", e)) + } + 
(ParquetCompression::Gzip, None) => Ok(Compression::GZIP(Default::default())), + (ParquetCompression::Brotli, Some(lvl)) => { + BrotliLevel::try_new(lvl as u32) + .map(Compression::BROTLI) + .map_err(|e| format!("Invalid Brotli compression level: {}", e)) + } + (ParquetCompression::Brotli, None) => Ok(Compression::BROTLI(Default::default())), + (ParquetCompression::Zstd, Some(lvl)) => { + ZstdLevel::try_new(lvl) + .map(Compression::ZSTD) + .map_err(|e| format!("Invalid ZSTD compression level: {}", e)) + } + (ParquetCompression::Zstd, None) => Ok(Compression::ZSTD(ZstdLevel::default())), + } + } +} + impl From for Compression { fn from(compression: ParquetCompression) -> Self { - match compression { - ParquetCompression::Uncompressed => Compression::UNCOMPRESSED, - ParquetCompression::Snappy => Compression::SNAPPY, - ParquetCompression::Gzip => Compression::GZIP(Default::default()), - ParquetCompression::Brotli => Compression::BROTLI(Default::default()), - ParquetCompression::Lz4 => Compression::LZ4, - ParquetCompression::Zstd => Compression::ZSTD(ZstdLevel::default()), + compression.to_compression(None).expect("Default compression should always be valid") + } +} + +/// Parquet writer version +#[configurable_component] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub enum ParquetWriterVersion { + /// Parquet format version 1.0 (maximum compatibility) + V1, + /// Parquet format version 2.0 (modern format with better encoding) + #[default] + V2, +} + +impl From for WriterVersion { + fn from(version: ParquetWriterVersion) -> Self { + match version { + ParquetWriterVersion::V1 => WriterVersion::PARQUET_1_0, + ParquetWriterVersion::V2 => WriterVersion::PARQUET_2_0, } } } @@ -63,6 +107,8 @@ pub struct ParquetSerializerConfig { /// This schema defines the structure and types of the Parquet file columns. /// Specified as a map of field names to data types. /// + /// Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + /// /// Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, /// float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, /// timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. @@ -70,6 +116,55 @@ pub struct ParquetSerializerConfig { #[configurable(metadata(docs::examples = "schema_example()"))] pub schema: Option, + /// Automatically infer schema from event data + /// + /// When enabled, the schema is inferred from each batch of events independently. + /// The schema is determined by examining the types of values in the events. + /// + /// **Type mapping:** + /// - String values → `utf8` + /// - Integer values → `int64` + /// - Float values → `float64` + /// - Boolean values → `boolean` + /// - Timestamp values → `timestamp_microsecond` + /// - Arrays/Objects → `utf8` (serialized as JSON) + /// + /// **Type conflicts:** If a field has different types across events in the same batch, + /// it will be encoded as `utf8` (string) and all values will be converted to strings. + /// + /// **Important:** Schema consistency across batches is the operator's responsibility. + /// Use VRL transforms to ensure consistent types if needed. Each batch may produce + /// a different schema if event structure varies. + /// + /// **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + /// + /// Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. 
+ #[serde(default)] + #[configurable(metadata(docs::examples = true))] + pub infer_schema: bool, + + /// Column names to exclude from Parquet encoding + /// + /// These columns will be completely excluded from the Parquet file. + /// Useful for filtering out metadata, internal fields, or temporary data. + /// + /// Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + #[serde(default)] + #[configurable(metadata(docs::examples = "vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"))] + pub exclude_columns: Option>, + + /// Maximum number of columns to encode + /// + /// Limits the number of columns in the Parquet file. Additional columns beyond + /// this limit will be silently dropped. Columns are selected in the order they + /// appear in the first event. + /// + /// Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + #[serde(default = "default_max_columns")] + #[configurable(metadata(docs::examples = 500))] + #[configurable(metadata(docs::examples = 1000))] + pub max_columns: usize, + /// Compression algorithm to use for Parquet columns /// /// Compression is applied to all columns in the Parquet file. @@ -80,6 +175,53 @@ pub struct ParquetSerializerConfig { #[configurable(metadata(docs::examples = "zstd"))] pub compression: ParquetCompression, + /// Compression level for algorithms that support it. + /// + /// Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + /// + /// **ZSTD levels** (1-22): + /// - 1-3: Fastest, moderate compression (level 3 is default) + /// - 4-9: Good balance of speed and compression + /// - 10-15: Better compression, slower encoding + /// - 16-22: Maximum compression, slowest (good for cold storage) + /// + /// **GZIP levels** (1-9): + /// - 1-3: Faster, less compression + /// - 6: Default balance (recommended) + /// - 9: Maximum compression, slowest + /// + /// **Brotli levels** (0-11): + /// - 0-4: Faster encoding + /// - 1: Default (recommended) + /// - 5-11: Better compression, slower + /// + /// Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + /// Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + #[serde(default)] + #[configurable(metadata(docs::examples = 3))] + #[configurable(metadata(docs::examples = 6))] + #[configurable(metadata(docs::examples = 10))] + pub compression_level: Option, + + /// Parquet format writer version. + /// + /// Controls which Parquet format version to write: + /// - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + /// - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + /// + /// Version 2 benefits: + /// - More efficient encoding for certain data types (10-20% smaller files) + /// - Better statistics for query optimization + /// - Improved data page format + /// - Required for some advanced features + /// + /// Use v1 for maximum compatibility with older readers (pre-2018 tools). + /// Use v2 for better performance with modern query engines (Athena, Spark, Presto). + #[serde(default)] + #[configurable(metadata(docs::examples = "v1"))] + #[configurable(metadata(docs::examples = "v2"))] + pub writer_version: ParquetWriterVersion, + /// Number of rows per row group /// /// Row groups are Parquet's unit of parallelization. 
Larger row groups @@ -105,100 +247,111 @@ pub struct ParquetSerializerConfig { #[configurable(metadata(docs::examples = true))] pub allow_nullable_fields: bool, - /// Estimated compressed output size in bytes for buffer pre-allocation. + /// Sorting order for rows within row groups. /// - /// Pre-allocating the output buffer based on expected compressed size significantly - /// reduces memory overhead by avoiding repeated reallocations during encoding. - /// If not specified, defaults to a heuristic based on estimated uncompressed size. + /// Pre-sorting rows by specified columns before writing can significantly improve both + /// compression ratios and query performance. This is especially valuable for time-series + /// data and event logs. /// - /// Guidelines for setting this value: - /// - Monitor actual compressed output sizes in production - /// - Set to ~1.2x your average observed compressed size for headroom - /// - ZSTD typically achieves 3-10x compression on JSON data - /// - Example: If batches are 100MB uncompressed and compress to 10MB, set to ~12MB - #[serde(default)] - #[configurable(metadata(docs::examples = 10485760))] // 10MB - #[configurable(metadata(docs::examples = 52428800))] // 50MB - pub estimated_output_size: Option, - - /// Enable Bloom filters for all columns. + /// **Benefits:** + /// - **Better compression** (20-40% smaller files): Similar values are grouped together + /// - **Faster queries**: More effective min/max statistics enable better row group skipping + /// - **Improved caching**: Query engines can more efficiently cache sorted data /// - /// Bloom filters are probabilistic data structures that can significantly improve - /// query performance by allowing query engines to skip entire row groups when - /// searching for specific values. They are especially effective for: - /// - High-cardinality columns (UUIDs, user IDs, session IDs) - /// - String columns (URLs, emails, tags) - /// - Point queries (WHERE column = 'value') - /// - IN clauses (WHERE column IN (...)) + /// **Common patterns:** + /// - Time-series: Sort by timestamp descending (most recent first) + /// - Multi-tenant: Sort by tenant_id, then timestamp + /// - User analytics: Sort by user_id, then event_time /// - /// Trade-offs: - /// - Pros: Faster queries, better row group pruning in engines like Athena/Spark - /// - Cons: Slightly larger file sizes (typically 1-5% overhead), minimal write overhead + /// **Trade-offs:** + /// - Adds sorting overhead during encoding (typically 10-30% slower writes) + /// - Requires buffering entire batch in memory for sorting + /// - Most beneficial when queries frequently filter on sorted columns /// - /// When disabled (default), no Bloom filters are written. - #[serde(default)] - #[configurable(metadata(docs::examples = true))] - #[configurable(metadata(docs::examples = false))] - pub enable_bloom_filters: bool, - - /// False positive probability for Bloom filters. - /// - /// This controls the trade-off between Bloom filter size and accuracy. - /// Lower values produce larger but more accurate filters. - /// - /// - Default: 0.05 (5% false positive rate) - /// - Range: Must be between 0.0 and 1.0 (exclusive) - /// - Recommended: 0.01 (1%) for high-selectivity queries, 0.05 (5%) for general use - /// - /// Only takes effect when enable_bloom_filters is true. 
- #[serde(default = "default_bloom_fpp")] - #[configurable(metadata(docs::examples = 0.05))] - #[configurable(metadata(docs::examples = 0.01))] - pub bloom_filter_fpp: f64, - - /// Estimated number of distinct values for Bloom filter sizing. - /// - /// This should match the expected cardinality of your columns. Higher values - /// result in larger Bloom filters. If your actual distinct value count significantly - /// exceeds this number, the false positive rate may increase. + /// **Example:** + /// ```yaml + /// sorting_columns: + /// - column: timestamp + /// descending: true + /// - column: user_id + /// descending: false + /// ``` /// - /// - Default: 1,000,000 - /// - Recommended: Set based on your data's actual cardinality - /// - /// Only takes effect when enable_bloom_filters is true. - #[serde(default = "default_bloom_ndv")] - #[configurable(metadata(docs::examples = 1000000))] - #[configurable(metadata(docs::examples = 10000000))] - pub bloom_filter_ndv: u64, + /// If not specified, rows are written in the order they appear in the batch. + #[serde(default)] + pub sorting_columns: Option>, } -fn default_bloom_fpp() -> f64 { - 0.05 +/// Column sorting configuration +#[configurable_component] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct SortingColumnConfig { + /// Name of the column to sort by + #[configurable(metadata(docs::examples = "timestamp"))] + #[configurable(metadata(docs::examples = "user_id"))] + pub column: String, + + /// Sort in descending order (true) or ascending order (false) + /// + /// - `true`: Descending (Z-A, 9-0, newest-oldest) + /// - `false`: Ascending (A-Z, 0-9, oldest-newest) + #[serde(default)] + #[configurable(metadata(docs::examples = true))] + pub descending: bool, } -fn default_bloom_ndv() -> u64 { - 1_000_000 +fn default_max_columns() -> usize { + 1000 } -fn schema_example() -> std::collections::BTreeMap { - let mut map = std::collections::BTreeMap::new(); - map.insert("id".to_string(), "int64".to_string()); - map.insert("name".to_string(), "utf8".to_string()); - map.insert("timestamp".to_string(), "timestamp_microsecond".to_string()); - map +fn schema_example() -> SchemaDefinition { + use std::collections::BTreeMap; + use super::schema_definition::FieldDefinition; + + let mut fields = BTreeMap::new(); + fields.insert( + "id".to_string(), + FieldDefinition { + r#type: "int64".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); + fields.insert( + "name".to_string(), + FieldDefinition { + r#type: "utf8".to_string(), + bloom_filter: true, // Example: enable for high-cardinality string field + bloom_filter_num_distinct_values: Some(1_000_000), + bloom_filter_false_positive_pct: Some(0.01), + }, + ); + fields.insert( + "timestamp".to_string(), + FieldDefinition { + r#type: "timestamp_microsecond".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); + SchemaDefinition { fields } } impl std::fmt::Debug for ParquetSerializerConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ParquetSerializerConfig") .field("schema", &self.schema.is_some()) + .field("infer_schema", &self.infer_schema) + .field("exclude_columns", &self.exclude_columns) + .field("max_columns", &self.max_columns) .field("compression", &self.compression) + .field("compression_level", &self.compression_level) + .field("writer_version", &self.writer_version) .field("row_group_size", 
&self.row_group_size) .field("allow_nullable_fields", &self.allow_nullable_fields) - .field("estimated_output_size", &self.estimated_output_size) - .field("enable_bloom_filters", &self.enable_bloom_filters) - .field("bloom_filter_fpp", &self.bloom_filter_fpp) - .field("bloom_filter_ndv", &self.bloom_filter_ndv) + .field("sorting_columns", &self.sorting_columns) .finish() } } @@ -208,13 +361,25 @@ impl ParquetSerializerConfig { pub fn new(schema: SchemaDefinition) -> Self { Self { schema: Some(schema), + infer_schema: false, + exclude_columns: None, + max_columns: default_max_columns(), compression: ParquetCompression::default(), + compression_level: None, + writer_version: ParquetWriterVersion::default(), row_group_size: None, allow_nullable_fields: false, - estimated_output_size: None, - enable_bloom_filters: false, - bloom_filter_fpp: default_bloom_fpp(), - bloom_filter_ndv: default_bloom_ndv(), + sorting_columns: None, + } + } + + /// Validate the configuration + fn validate(&self) -> Result<(), String> { + // Must specify exactly one schema method + match (self.schema.is_some(), self.infer_schema) { + (true, true) => Err("Cannot use both 'schema' and 'infer_schema: true'. Choose one.".to_string()), + (false, false) => Err("Must specify either 'schema' or 'infer_schema: true'".to_string()), + _ => Ok(()) } } @@ -229,63 +394,151 @@ impl ParquetSerializerConfig { } } +/// Schema mode for Parquet serialization +#[derive(Clone, Debug)] +enum SchemaMode { + /// Use pre-defined explicit schema + Explicit { + schema: Arc, + }, + /// Infer schema from each batch + Inferred { + exclude_columns: std::collections::BTreeSet, + max_columns: usize, + }, +} + /// Parquet batch serializer that holds the schema and writer configuration #[derive(Clone, Debug)] pub struct ParquetSerializer { - schema: Arc, + schema_mode: SchemaMode, writer_properties: WriterProperties, - estimated_output_size: Option, } impl ParquetSerializer { /// Create a new ParquetSerializer with the given configuration pub fn new(config: ParquetSerializerConfig) -> Result { - let schema_def = config.schema.ok_or_else(|| { - vector_common::Error::from( - "Parquet serializer requires a schema. Specify 'schema' in the configuration." 
- ) - })?; - - // Convert SchemaDefinition to Arrow Schema - let mut schema = schema_def - .to_arrow_schema() - .map_err(|e| vector_common::Error::from(e.to_string()))?; - - // If allow_nullable_fields is enabled, transform the schema once here - // instead of on every batch encoding - if config.allow_nullable_fields { - schema = Arc::new(Schema::new_with_metadata( - schema - .fields() - .iter() - .map(|f| Arc::new(super::arrow::make_field_nullable(f))) - .collect::>(), - schema.metadata().clone(), - )); - } + // Validate configuration + config.validate() + .map_err(|e| vector_common::Error::from(e))?; + + // Keep a copy of schema_def for later use with Bloom filters + let schema_def_opt = config.schema.clone(); + + // Determine schema mode + let schema_mode = if config.infer_schema { + SchemaMode::Inferred { + exclude_columns: config.exclude_columns + .unwrap_or_default() + .into_iter() + .collect(), + max_columns: config.max_columns, + } + } else { + let schema_def = config.schema.ok_or_else(|| { + vector_common::Error::from("Schema required when infer_schema is false") + })?; + + // Convert SchemaDefinition to Arrow Schema + let mut schema = schema_def + .to_arrow_schema() + .map_err(|e| vector_common::Error::from(e.to_string()))?; + + // If allow_nullable_fields is enabled, transform the schema once here + if config.allow_nullable_fields { + schema = Arc::new(Schema::new_with_metadata( + schema + .fields() + .iter() + .map(|f| Arc::new(super::arrow::make_field_nullable(f))) + .collect::>(), + schema.metadata().clone(), + )); + } + + SchemaMode::Explicit { schema } + }; // Build writer properties + let compression = config.compression.to_compression(config.compression_level) + .map_err(|e| vector_common::Error::from(e))?; + + tracing::debug!( + compression = ?config.compression, + compression_level = ?config.compression_level, + writer_version = ?config.writer_version, + infer_schema = config.infer_schema, + "Configuring Parquet writer properties" + ); + let mut props_builder = WriterProperties::builder() - .set_compression(config.compression.into()); + .set_compression(compression) + .set_writer_version(config.writer_version.into()); if let Some(row_group_size) = config.row_group_size { props_builder = props_builder.set_max_row_group_size(row_group_size); } - // Enable Bloom filters if configured - if config.enable_bloom_filters { - props_builder = props_builder - .set_bloom_filter_enabled(true) - .set_bloom_filter_fpp(config.bloom_filter_fpp) - .set_bloom_filter_ndv(config.bloom_filter_ndv); + // Only apply Bloom filters and sorting for explicit schema mode + if let (SchemaMode::Explicit { schema }, Some(schema_def)) = (&schema_mode, &schema_def_opt) { + + // Apply per-column Bloom filter settings from schema + let bloom_filter_configs = schema_def.extract_bloom_filter_configs(); + for bloom_config in bloom_filter_configs { + if let Some(col_idx) = schema + .fields() + .iter() + .position(|f| f.name() == &bloom_config.column_name) + { + // Use field-specific settings or sensible defaults + let fpp = bloom_config.fpp.unwrap_or(0.05); // Default 5% false positive rate + let mut ndv = bloom_config.ndv.unwrap_or(1_000_000); // Default 1M distinct values + + // Cap NDV to row group size (can't have more distinct values than total rows) + if let Some(row_group_size) = config.row_group_size { + ndv = ndv.min(row_group_size as u64); + } + + let column_path = ColumnPath::from(schema.field(col_idx).name().as_str()); + props_builder = props_builder + 
.set_column_bloom_filter_enabled(column_path.clone(), true) + .set_column_bloom_filter_fpp(column_path.clone(), fpp) + .set_column_bloom_filter_ndv(column_path, ndv); + } + } + + // Set sorting columns if configured + if let Some(sorting_cols) = &config.sorting_columns { + use parquet::format::SortingColumn; + + let parquet_sorting_cols: Vec = sorting_cols + .iter() + .map(|col| { + let col_idx = schema + .fields() + .iter() + .position(|f| f.name() == &col.column) + .ok_or_else(|| { + vector_common::Error::from(format!( + "Sorting column '{}' not found in schema", + col.column + )) + })?; + + Ok(SortingColumn::new(col_idx as i32, col.descending, false)) + }) + .collect::, vector_common::Error>>()?; + + props_builder = props_builder.set_sorting_columns(Some(parquet_sorting_cols)); + } } + // Note: Bloom filters and sorting are NOT applied for inferred schemas let writer_properties = props_builder.build(); Ok(Self { - schema, + schema_mode, writer_properties, - estimated_output_size: config.estimated_output_size, }) } } @@ -298,12 +551,16 @@ impl tokio_util::codec::Encoder> for ParquetSerializer { return Err(ParquetEncodingError::NoEvents); } - let bytes = encode_events_to_parquet( - &events, - Arc::clone(&self.schema), - &self.writer_properties, - self.estimated_output_size, - )?; + // Determine schema based on mode + let schema = match &self.schema_mode { + SchemaMode::Explicit { schema } => Arc::clone(schema), + SchemaMode::Inferred { + exclude_columns, + max_columns, + } => infer_schema_from_events(&events, exclude_columns, *max_columns)?, + }; + + let bytes = encode_events_to_parquet(&events, schema, &self.writer_properties)?; // Use put() instead of extend_from_slice to avoid copying when possible buffer.put(bytes); @@ -336,6 +593,21 @@ pub enum ParquetEncodingError { #[snafu(display("Schema must be provided before encoding"))] NoSchemaProvided, + /// No fields could be inferred from events + #[snafu(display("No fields could be inferred from events (all fields excluded or only null values)"))] + NoFieldsInferred, + + /// Invalid event type (not a log event) + #[snafu(display("Invalid event type, expected log event"))] + InvalidEventType, + + /// JSON serialization error for nested types + #[snafu(display("Failed to serialize nested type as JSON: {}", source))] + JsonSerialization { + /// The underlying JSON error + source: serde_json::Error, + }, + /// IO error during encoding #[snafu(display("IO error: {}", source))] Io { @@ -362,12 +634,123 @@ impl From for ParquetEncodingError { } } +impl From for ParquetEncodingError { + fn from(error: serde_json::Error) -> Self { + Self::JsonSerialization { source: error } + } +} + +/// Infer Arrow DataType from a Vector Value +fn infer_arrow_type(value: &vector_core::event::Value) -> arrow::datatypes::DataType { + use vector_core::event::Value; + use arrow::datatypes::{DataType, TimeUnit}; + + match value { + Value::Bytes(_) => DataType::Utf8, + Value::Integer(_) => DataType::Int64, + Value::Float(_) => DataType::Float64, + Value::Boolean(_) => DataType::Boolean, + Value::Timestamp(_) => DataType::Timestamp(TimeUnit::Microsecond, None), + // Nested types and regex are always serialized as strings + Value::Array(_) | Value::Object(_) | Value::Regex(_) => DataType::Utf8, + // Null doesn't determine type, default to Utf8 + Value::Null => DataType::Utf8, + } +} + +/// Infer schema from a batch of events +fn infer_schema_from_events( + events: &[Event], + exclude_columns: &std::collections::BTreeSet, + max_columns: usize, +) -> Result, 
ParquetEncodingError> { + use std::collections::BTreeMap; + use arrow::datatypes::{DataType, Field}; + use vector_core::event::Value; + + let mut field_types: BTreeMap = BTreeMap::new(); + let mut type_conflicts: BTreeMap> = BTreeMap::new(); + + for event in events { + // Only process log events + let log = match event { + Event::Log(log) => log, + _ => return Err(ParquetEncodingError::InvalidEventType), + }; + + let fields_iter = log.all_event_fields().ok_or(ParquetEncodingError::InvalidEventType)?; + + for (key, value) in fields_iter { + let key_str = key.to_string(); + + // Skip excluded columns + if exclude_columns.contains(&key_str) { + continue; + } + + // Skip Value::Null (doesn't determine type) + if matches!(value, Value::Null) { + continue; + } + + // Enforce max columns (skip new fields after limit) + if field_types.len() >= max_columns && !field_types.contains_key(&key_str) { + tracing::debug!( + column = %key_str, + max_columns = max_columns, + "Skipping column: max_columns limit reached" + ); + continue; + } + + let inferred_type = infer_arrow_type(&value); + + match field_types.get(&key_str) { + None => { + // First occurrence of this field + field_types.insert(key_str, inferred_type); + } + Some(existing_type) if existing_type != &inferred_type => { + // Type conflict detected - fallback to Utf8 + tracing::warn!( + column = %key_str, + existing_type = ?existing_type, + new_type = ?inferred_type, + "Type conflict detected, encoding as Utf8" + ); + + type_conflicts + .entry(key_str.clone()) + .or_insert_with(|| vec![existing_type.clone()]) + .push(inferred_type); + + field_types.insert(key_str, DataType::Utf8); + } + Some(_) => { + // Same type, no action needed + } + } + } + } + + if field_types.is_empty() { + return Err(ParquetEncodingError::NoFieldsInferred); + } + + // Build Arrow schema (all fields nullable) + let arrow_fields: Vec> = field_types + .into_iter() + .map(|(name, dtype)| Arc::new(Field::new(name, dtype, true))) + .collect(); + + Ok(Arc::new(Schema::new(arrow_fields))) +} + /// Encodes a batch of events into Parquet format pub fn encode_events_to_parquet( events: &[Event], schema: Arc, writer_properties: &WriterProperties, - estimated_output_size: Option, ) -> Result { if events.is_empty() { return Err(ParquetEncodingError::NoEvents); @@ -379,20 +762,8 @@ pub fn encode_events_to_parquet( // Get batch metadata before we move into writer scope let batch_schema = record_batch.schema(); - // Calculate buffer capacity to avoid reallocations - // This is critical for memory efficiency with large batches - let buffer_capacity = estimated_output_size.unwrap_or_else(|| { - // Heuristic: Estimate based on number of events and fields - // Assuming average 2KB per event after compression (conservative estimate) - // Users should tune estimated_output_size based on actual data for best results - let estimated_size = events.len() * 2048; - - // Cap at reasonable maximum to avoid over-allocation for small batches - estimated_size.min(128 * 1024 * 1024) // Cap at 128MB - }); - - // Write RecordBatch to Parquet format in memory with pre-allocated buffer - let mut buffer = Vec::with_capacity(buffer_capacity); + // Write RecordBatch to Parquet format in memory + let mut buffer = Vec::new(); { let mut writer = ArrowWriter::try_new( &mut buffer, @@ -401,17 +772,9 @@ pub fn encode_events_to_parquet( )?; writer.write(&record_batch)?; - - // Explicitly drop RecordBatch to release Arrow array memory immediately - drop(record_batch); - - // close() consumes the writer, releasing 
compression buffers writer.close()?; } - // Shrink buffer to actual size to free excess pre-allocated capacity - buffer.shrink_to_fit(); - Ok(Bytes::from(buffer)) } @@ -651,19 +1014,30 @@ mod tests { #[test] fn test_parquet_serializer_config() { use std::collections::BTreeMap; - - let mut schema_map = BTreeMap::new(); - schema_map.insert("field".to_string(), "int64".to_string()); + use super::schema_definition::FieldDefinition; + + let mut fields = BTreeMap::new(); + fields.insert( + "field".to_string(), + FieldDefinition { + r#type: "int64".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); let config = ParquetSerializerConfig { - schema: Some(SchemaDefinition::Simple(schema_map)), + schema: Some(SchemaDefinition { fields }), + infer_schema: false, + exclude_columns: None, + max_columns: default_max_columns(), compression: ParquetCompression::Zstd, + compression_level: None, + writer_version: ParquetWriterVersion::default(), row_group_size: Some(1000), allow_nullable_fields: false, - estimated_output_size: None, - enable_bloom_filters: false, - bloom_filter_fpp: default_bloom_fpp(), - bloom_filter_ndv: default_bloom_ndv(), + sorting_columns: None, }; let serializer = ParquetSerializer::new(config); @@ -674,13 +1048,15 @@ mod tests { fn test_parquet_serializer_no_schema_fails() { let config = ParquetSerializerConfig { schema: None, + infer_schema: false, + exclude_columns: None, + max_columns: default_max_columns(), compression: ParquetCompression::default(), + compression_level: None, + writer_version: ParquetWriterVersion::default(), row_group_size: None, allow_nullable_fields: false, - estimated_output_size: None, - enable_bloom_filters: false, - bloom_filter_fpp: default_bloom_fpp(), - bloom_filter_ndv: default_bloom_ndv(), + sorting_columns: None, }; let result = ParquetSerializer::new(config); @@ -691,12 +1067,29 @@ mod tests { fn test_encoder_trait_implementation() { use std::collections::BTreeMap; use tokio_util::codec::Encoder; + use super::schema_definition::FieldDefinition; + + let mut fields = BTreeMap::new(); + fields.insert( + "id".to_string(), + FieldDefinition { + r#type: "int64".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); + fields.insert( + "name".to_string(), + FieldDefinition { + r#type: "utf8".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); - let mut schema_map = BTreeMap::new(); - schema_map.insert("id".to_string(), "int64".to_string()); - schema_map.insert("name".to_string(), "utf8".to_string()); - - let config = ParquetSerializerConfig::new(SchemaDefinition::Simple(schema_map)); + let config = ParquetSerializerConfig::new(SchemaDefinition { fields }); let mut serializer = ParquetSerializer::new(config).unwrap(); let mut log = LogEvent::default(); @@ -756,9 +1149,18 @@ mod tests { fn test_allow_nullable_fields_config() { use std::collections::BTreeMap; use tokio_util::codec::Encoder; - - let mut schema_map = BTreeMap::new(); - schema_map.insert("required_field".to_string(), "int64".to_string()); + use super::schema_definition::FieldDefinition; + + let mut fields = BTreeMap::new(); + fields.insert( + "required_field".to_string(), + FieldDefinition { + r#type: "int64".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); let mut log1 = LogEvent::default(); 
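+        // One-field (int64) schema; the test enables `allow_nullable_fields` below so
+        // events missing `required_field` can be encoded with NULL rather than rejected.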
log1.insert("required_field", 42); @@ -770,7 +1172,7 @@ mod tests { // Note: SchemaDefinition creates nullable fields by default // This test verifies that the allow_nullable_fields flag works - let mut config = ParquetSerializerConfig::new(SchemaDefinition::Simple(schema_map)); + let mut config = ParquetSerializerConfig::new(SchemaDefinition { fields }); config.allow_nullable_fields = true; let mut serializer = ParquetSerializer::new(config).unwrap(); diff --git a/lib/codecs/src/encoding/format/schema_definition.rs b/lib/codecs/src/encoding/format/schema_definition.rs index 0c98788907f7a..a1f66aa4d43fe 100644 --- a/lib/codecs/src/encoding/format/schema_definition.rs +++ b/lib/codecs/src/encoding/format/schema_definition.rs @@ -21,32 +21,106 @@ pub enum SchemaDefinitionError { }, } +/// Per-column configuration including type and Bloom filter settings +#[configurable_component] +#[derive(Debug, Clone)] +pub struct FieldDefinition { + /// Data type for this field + #[configurable(metadata(docs::examples = "utf8"))] + #[configurable(metadata(docs::examples = "int64"))] + #[configurable(metadata(docs::examples = "timestamp_ms"))] + pub r#type: String, + + /// Enable Bloom filter for this specific column + /// + /// When enabled, a Bloom filter will be created for this column to improve + /// query performance for point lookups and IN clauses. Only enable for + /// high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + #[serde(default)] + #[configurable(metadata(docs::examples = true))] + pub bloom_filter: bool, + + /// Number of distinct values expected for this column's Bloom filter + /// + /// This controls the size of the Bloom filter. Should match the actual + /// cardinality of the column. Will be automatically capped to the batch size. + /// + /// - Low cardinality (countries, states): 1,000 - 100,000 + /// - Medium cardinality (cities, products): 100,000 - 1,000,000 + /// - High cardinality (UUIDs, user IDs): 10,000,000+ + #[serde(default, alias = "bloom_filter_ndv")] + #[configurable(metadata(docs::examples = 1000000))] + #[configurable(metadata(docs::examples = 10000000))] + pub bloom_filter_num_distinct_values: Option, + + /// False positive probability for this column's Bloom filter (as a percentage) + /// + /// Lower values create larger but more accurate filters. 
+ /// + /// - 0.05 (5%): Good balance for general use + /// - 0.01 (1%): Better for high-selectivity queries + #[serde(default, alias = "bloom_filter_fpp")] + #[configurable(metadata(docs::examples = 0.05))] + #[configurable(metadata(docs::examples = 0.01))] + pub bloom_filter_false_positive_pct: Option, +} + +/// Bloom filter configuration for a specific column +#[derive(Debug, Clone)] +pub struct ColumnBloomFilterConfig { + /// Column name + pub column_name: String, + /// Whether Bloom filter is enabled for this column + pub enabled: bool, + /// Number of distinct values (if specified) + pub ndv: Option, + /// False positive probability (if specified) + pub fpp: Option, +} + /// A schema definition that can be deserialized from configuration #[configurable_component] #[derive(Debug, Clone)] -#[serde(untagged)] -pub enum SchemaDefinition { - /// Simple map of field names to type names - Simple(BTreeMap), +pub struct SchemaDefinition { + /// Map of field names to their type and Bloom filter configuration + #[serde(flatten)] + #[configurable(metadata(docs::additional_props_description = "A field definition specifying the data type and optional Bloom filter configuration."))] + pub fields: BTreeMap, } impl SchemaDefinition { /// Convert the schema definition to an Arrow Schema pub fn to_arrow_schema(&self) -> Result, SchemaDefinitionError> { - match self { - SchemaDefinition::Simple(fields) => { - let arrow_fields: Result, _> = fields - .iter() - .map(|(name, type_str)| { - let data_type = parse_data_type(type_str, name)?; - // All fields are nullable by default when defined in config - Ok(Arc::new(Field::new(name, data_type, true))) - }) - .collect(); + let arrow_fields: Result, _> = self + .fields + .iter() + .map(|(name, field_def)| { + let data_type = parse_data_type(&field_def.r#type, name)?; + // All fields are nullable by default when defined in config + Ok(Arc::new(Field::new(name, data_type, true))) + }) + .collect(); - Ok(Arc::new(Schema::new(arrow_fields?))) - } - } + Ok(Arc::new(Schema::new(arrow_fields?))) + } + + /// Extract per-column Bloom filter configurations + pub fn extract_bloom_filter_configs(&self) -> Vec { + self.fields + .iter() + .filter_map(|(name, field_def)| { + if field_def.bloom_filter { + Some(ColumnBloomFilterConfig { + column_name: name.clone(), + enabled: true, + ndv: field_def.bloom_filter_num_distinct_values, + fpp: field_def.bloom_filter_false_positive_pct, + }) + } else { + None + } + }) + .collect() } } @@ -136,11 +210,35 @@ mod tests { #[test] fn test_simple_schema_definition() { let mut fields = BTreeMap::new(); - fields.insert("id".to_string(), "int64".to_string()); - fields.insert("name".to_string(), "utf8".to_string()); - fields.insert("value".to_string(), "float64".to_string()); + fields.insert( + "id".to_string(), + FieldDefinition { + r#type: "int64".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); + fields.insert( + "name".to_string(), + FieldDefinition { + r#type: "utf8".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); + fields.insert( + "value".to_string(), + FieldDefinition { + r#type: "float64".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); - let schema_def = SchemaDefinition::Simple(fields); + let schema_def = SchemaDefinition { fields }; let schema = schema_def.to_arrow_schema().unwrap(); 
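+        // `fields` is a BTreeMap, so the generated Arrow schema lists the columns in
+        // sorted key order: id, name, value.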
assert_eq!(schema.fields().len(), 3); @@ -159,12 +257,44 @@ mod tests { #[test] fn test_timestamp_types() { let mut fields = BTreeMap::new(); - fields.insert("ts_s".to_string(), "timestamp_second".to_string()); - fields.insert("ts_ms".to_string(), "timestamp_millisecond".to_string()); - fields.insert("ts_us".to_string(), "timestamp_microsecond".to_string()); - fields.insert("ts_ns".to_string(), "timestamp_nanosecond".to_string()); + fields.insert( + "ts_s".to_string(), + FieldDefinition { + r#type: "timestamp_second".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); + fields.insert( + "ts_ms".to_string(), + FieldDefinition { + r#type: "timestamp_millisecond".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); + fields.insert( + "ts_us".to_string(), + FieldDefinition { + r#type: "timestamp_microsecond".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); + fields.insert( + "ts_ns".to_string(), + FieldDefinition { + r#type: "timestamp_nanosecond".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); - let schema_def = SchemaDefinition::Simple(fields); + let schema_def = SchemaDefinition { fields }; let schema = schema_def.to_arrow_schema().unwrap(); assert_eq!( @@ -188,9 +318,17 @@ mod tests { #[test] fn test_unknown_data_type() { let mut fields = BTreeMap::new(); - fields.insert("bad_field".to_string(), "unknown_type".to_string()); + fields.insert( + "bad_field".to_string(), + FieldDefinition { + r#type: "unknown_type".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); - let schema_def = SchemaDefinition::Simple(fields); + let schema_def = SchemaDefinition { fields }; let result = schema_def.to_arrow_schema(); assert!(result.is_err()); @@ -198,4 +336,59 @@ mod tests { assert!(err.to_string().contains("unknown_type")); } + #[test] + fn test_bloom_filter_extraction() { + let mut fields = BTreeMap::new(); + fields.insert( + "id".to_string(), + FieldDefinition { + r#type: "int64".to_string(), + bloom_filter: false, + bloom_filter_num_distinct_values: None, + bloom_filter_false_positive_pct: None, + }, + ); + fields.insert( + "user_id".to_string(), + FieldDefinition { + r#type: "utf8".to_string(), + bloom_filter: true, + bloom_filter_num_distinct_values: Some(10_000_000), + bloom_filter_false_positive_pct: Some(0.01), + }, + ); + fields.insert( + "request_id".to_string(), + FieldDefinition { + r#type: "utf8".to_string(), + bloom_filter: true, + bloom_filter_num_distinct_values: None, // Will use global default + bloom_filter_false_positive_pct: None, + }, + ); + + let schema_def = SchemaDefinition { fields }; + let bloom_configs = schema_def.extract_bloom_filter_configs(); + + assert_eq!(bloom_configs.len(), 2); + + // Check user_id config + let user_id_config = bloom_configs + .iter() + .find(|c| c.column_name == "user_id") + .unwrap(); + assert!(user_id_config.enabled); + assert_eq!(user_id_config.ndv, Some(10_000_000)); + assert_eq!(user_id_config.fpp, Some(0.01)); + + // Check request_id config + let request_id_config = bloom_configs + .iter() + .find(|c| c.column_name == "request_id") + .unwrap(); + assert!(request_id_config.enabled); + assert_eq!(request_id_config.ndv, None); + assert_eq!(request_id_config.fpp, 
None); + } + } diff --git a/lib/codecs/src/encoding/serializer.rs b/lib/codecs/src/encoding/serializer.rs index 1982f6af43ef0..4d1d50d186846 100644 --- a/lib/codecs/src/encoding/serializer.rs +++ b/lib/codecs/src/encoding/serializer.rs @@ -122,7 +122,10 @@ pub enum SerializerConfig { /// /// [apache_parquet]: https://parquet.apache.org/ #[cfg(feature = "parquet")] - Parquet(ParquetSerializerConfig), + Parquet { + /// Apache Parquet-specific encoder options. + parquet: ParquetSerializerConfig, + }, /// No encoding. /// @@ -176,7 +179,10 @@ pub enum BatchSerializerConfig { /// /// [apache_parquet]: https://parquet.apache.org/ #[cfg(feature = "parquet")] - Parquet(ParquetSerializerConfig), + Parquet { + /// Apache Parquet-specific encoder options. + parquet: ParquetSerializerConfig, + }, } #[cfg(any(feature = "arrow", feature = "parquet"))] @@ -187,7 +193,7 @@ impl BatchSerializerConfig { #[cfg(feature = "arrow")] BatchSerializerConfig::ArrowStream(arrow_config) => arrow_config.input_type(), #[cfg(feature = "parquet")] - BatchSerializerConfig::Parquet(parquet_config) => parquet_config.input_type(), + BatchSerializerConfig::Parquet { parquet } => parquet.input_type(), } } @@ -197,7 +203,7 @@ impl BatchSerializerConfig { #[cfg(feature = "arrow")] BatchSerializerConfig::ArrowStream(arrow_config) => arrow_config.schema_requirement(), #[cfg(feature = "parquet")] - BatchSerializerConfig::Parquet(parquet_config) => parquet_config.schema_requirement(), + BatchSerializerConfig::Parquet { parquet } => parquet.schema_requirement(), } } } @@ -301,7 +307,7 @@ impl SerializerConfig { } SerializerConfig::Text(config) => Ok(Serializer::Text(config.build())), #[cfg(feature = "parquet")] - SerializerConfig::Parquet(_) => Err( + SerializerConfig::Parquet { .. } => Err( VectorError::from( "Parquet codec is available only for batch encoding and cannot be built as a framed serializer.", ) @@ -343,7 +349,7 @@ impl SerializerConfig { FramingConfig::CharacterDelimited(CharacterDelimitedEncoderConfig::new(0)) } #[cfg(feature = "parquet")] - SerializerConfig::Parquet(_) => FramingConfig::NewlineDelimited, + SerializerConfig::Parquet { .. 
} => FramingConfig::NewlineDelimited, } } @@ -364,7 +370,7 @@ impl SerializerConfig { SerializerConfig::Otlp => OtlpSerializerConfig::default().input_type(), SerializerConfig::Protobuf(config) => config.input_type(), #[cfg(feature = "parquet")] - SerializerConfig::Parquet(config) => config.input_type(), + SerializerConfig::Parquet { parquet } => parquet.input_type(), SerializerConfig::RawMessage => RawMessageSerializerConfig.input_type(), SerializerConfig::Text(config) => config.input_type(), } @@ -387,7 +393,7 @@ impl SerializerConfig { SerializerConfig::Otlp => OtlpSerializerConfig::default().schema_requirement(), SerializerConfig::Protobuf(config) => config.schema_requirement(), #[cfg(feature = "parquet")] - SerializerConfig::Parquet(config) => config.schema_requirement(), + SerializerConfig::Parquet { parquet } => parquet.schema_requirement(), SerializerConfig::RawMessage => RawMessageSerializerConfig.schema_requirement(), SerializerConfig::Text(config) => config.schema_requirement(), } diff --git a/src/codecs/encoding/config.rs b/src/codecs/encoding/config.rs index f1c6a8bcad78e..c87d5c5d6f28c 100644 --- a/src/codecs/encoding/config.rs +++ b/src/codecs/encoding/config.rs @@ -147,8 +147,8 @@ impl EncodingConfigWithFraming { pub fn build_encoder(&self, sink_type: SinkType) -> crate::Result<(Transformer, EncoderKind)> { match &self.encoding.encoding { #[cfg(feature = "codecs-parquet")] - SerializerConfig::Parquet(parquet_config) => { - let serializer = ParquetSerializer::new(parquet_config.clone())?; + SerializerConfig::Parquet { parquet } => { + let serializer = ParquetSerializer::new(parquet.clone())?; let encoder = EncoderKind::Batch(BatchEncoder::new(BatchSerializer::Parquet( serializer, ))); diff --git a/src/components/validation/resources/mod.rs b/src/components/validation/resources/mod.rs index f72116111c93d..3ab2d8f86010d 100644 --- a/src/components/validation/resources/mod.rs +++ b/src/components/validation/resources/mod.rs @@ -240,7 +240,7 @@ fn serializer_config_to_deserializer( #[cfg(feature = "codecs-opentelemetry")] SerializerConfig::Otlp => todo!(), #[cfg(feature = "codecs-parquet")] - SerializerConfig::Parquet(_) => DeserializerConfig::Bytes, // Parquet files are binary + SerializerConfig::Parquet { .. } => DeserializerConfig::Bytes, // Parquet files are binary }; deserializer_config.build() diff --git a/website/cue/reference/components/sinks/aws_s3.cue b/website/cue/reference/components/sinks/aws_s3.cue index 17c51f1688697..529a9367c36b9 100644 --- a/website/cue/reference/components/sinks/aws_s3.cue +++ b/website/cue/reference/components/sinks/aws_s3.cue @@ -113,14 +113,38 @@ components: sinks: aws_s3: components._aws & { ## Schema Configuration - When using Parquet encoding, you **must** specify a schema that defines the structure and - types of the Parquet file columns. The schema is defined as a simple map of field names to - data types. Vector events are converted to Arrow RecordBatches and then written as Parquet files. + Vector supports two approaches for defining the Parquet schema: - All fields defined in the schema are nullable by default, meaning missing fields will be encoded - as NULL values in the Parquet file. + 1. **Explicit Schema**: Define the exact structure and data types for your Parquet files + 2. **Automatic Schema Inference**: Let Vector automatically infer the schema from your event data + + You must choose exactly one approach - they are mutually exclusive. 
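+ At a glance, the two modes differ only in which key is set under `parquet`. A minimal
+ sketch of each mode (field names are illustrative; all other options keep their defaults):
+
+ ```yaml
+ # Option 1: automatic schema inference
+ encoding:
+   codec: parquet
+   parquet:
+     infer_schema: true
+ ---
+ # Option 2: explicit schema
+ encoding:
+   codec: parquet
+   parquet:
+     schema:
+       timestamp:
+         type: timestamp_microsecond
+       message:
+         type: utf8
+ ```
+
+ Configurations that set both `schema` and `infer_schema: true`, or neither, are rejected
+ when the sink is built.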
+ + ### Automatic Schema Inference (Recommended for Getting Started) + + When enabled, Vector automatically infers the schema from each batch of events by examining + the data types of values in the events. This is the easiest way to get started with Parquet + encoding. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) - **Example configuration:** + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Limitations:** Bloom filters and sorting are not supported with automatic schema inference. + Use explicit schema if you need these features. + + **Example configuration with schema inference:** ```yaml sinks: @@ -133,32 +157,87 @@ components: sinks: aws_s3: components._aws & { timeout_secs: 60 encoding: codec: parquet - schema: - # Timestamps - timestamp: timestamp_microsecond - created_at: timestamp_millisecond + parquet: + infer_schema: true + exclude_columns: + - _metadata + - internal_id + max_columns: 1000 + compression: zstd + compression_level: 6 + writer_version: v2 + row_group_size: 50000 + ``` + + ### Explicit Schema (Recommended for Production) - # String fields - user_id: utf8 - event_name: utf8 - message: utf8 + For production use, explicitly defining the schema provides better control, consistency, + and access to advanced features like per-column Bloom filters and sorting. The schema + is defined as a map of field names to field definitions. - # Numeric fields - team_id: int64 - duration_ms: float64 - count: int32 + All fields defined in the schema are nullable by default, meaning missing fields will be encoded + as NULL values in the Parquet file. 
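+ For instance, with a two-field schema (hypothetical field names shown below), an event
+ that carries only `user_id` still produces a row; its `duration_ms` cell is written as NULL:
+
+ ```yaml
+ parquet:
+   schema:
+     user_id:
+       type: utf8
+     duration_ms:
+       type: float64
+ # An event {"user_id": "abc"} becomes the row (user_id = "abc", duration_ms = NULL)
+ ```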
- # Boolean - is_active: boolean + **Example configuration with explicit schema:** + ```yaml + sinks: + s3: + type: aws_s3 + bucket: my-bucket + compression: none # Parquet handles compression internally + batch: + max_events: 50000 + timeout_secs: 60 + encoding: + codec: parquet parquet: + schema: + # Timestamps + timestamp: + type: timestamp_microsecond + bloom_filter: false + created_at: + type: timestamp_millisecond + bloom_filter: false + + # String fields with per-column Bloom filters + user_id: + type: utf8 + bloom_filter: true # Enable for high-cardinality field + bloom_filter_num_distinct_values: 10000000 + bloom_filter_false_positive_pct: 0.01 + event_name: + type: utf8 + bloom_filter: false + message: + type: utf8 + bloom_filter: false + + # Numeric fields + team_id: + type: int64 + bloom_filter: false + duration_ms: + type: float64 + bloom_filter: false + count: + type: int32 + bloom_filter: false + + # Boolean + is_active: + type: boolean + bloom_filter: false + compression: zstd + compression_level: 6 # ZSTD level 1-22 (higher = better compression) + writer_version: v2 # Use modern Parquet format row_group_size: 50000 # Should be <= batch.max_events allow_nullable_fields: true - estimated_output_size: 10485760 # 10MB - tune based on your data - enable_bloom_filters: true # Enable for better query performance - bloom_filter_fpp: 0.05 # 5% false positive rate - bloom_filter_ndv: 1000000 # Expected distinct values + sorting_columns: # Pre-sort for better compression and queries + - column: timestamp + descending: true # Most recent first ``` ## Supported Data Types @@ -196,7 +275,108 @@ components: sinks: aws_s3: components._aws & { ## Parquet Configuration Options - ### compression + ### Schema Options + + #### schema + + Explicitly define the Arrow schema for encoding events to Parquet. This schema defines + the structure and types of the Parquet file columns, specified as a map of field names + to field definitions. + + Each field definition includes: + - **type**: The Arrow data type (required) + - **bloom_filter**: Enable Bloom filter for this column (optional, default: false) + - **bloom_filter_num_distinct_values**: Number of distinct values for this column's Bloom filter (optional) + - **bloom_filter_false_positive_pct**: False positive probability for this column's Bloom filter (optional) + + All fields are nullable by default, meaning missing fields will be encoded as NULL values. + + **Mutually exclusive with `infer_schema`**. You must specify either `schema` or + `infer_schema: true`, but not both. + + **Example:** + ```yaml + schema: + user_id: + type: utf8 + bloom_filter: true + bloom_filter_num_distinct_values: 10000000 + bloom_filter_false_positive_pct: 0.01 + timestamp: + type: timestamp_microsecond + bloom_filter: false + count: + type: int64 + bloom_filter: false + ``` + + #### infer_schema + + Automatically infer the schema from event data. When enabled, Vector examines each + batch of events and automatically determines the appropriate Arrow data types based + on the values present. 
+ + **Type inference rules:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + - Type conflicts → `utf8` (fallback to string with warning) + + **Important considerations:** + - Schema may vary between batches if event structure changes + - Use VRL transforms to ensure type consistency if needed + - Bloom filters and sorting are not available with inferred schemas + - For production workloads, explicit schemas are recommended + + **Mutually exclusive with `schema`**. You must specify either `schema` or + `infer_schema: true`, but not both. + + **Default**: `false` + + #### exclude_columns + + Column names to exclude from Parquet encoding when using automatic schema inference. + These columns will be completely excluded from the Parquet file. + + Useful for filtering out metadata, internal fields, or temporary data that shouldn't + be persisted to long-term storage. + + **Only applies when `infer_schema` is enabled**. Ignored when using explicit schema + (use the schema definition to control which fields are included). + + **Example:** + ```yaml + infer_schema: true + exclude_columns: + - _metadata + - internal_id + - temp_field + ``` + + #### max_columns + + Maximum number of columns to encode when using automatic schema inference. Additional + columns beyond this limit will be silently dropped. Columns are selected in the order + they appear in the first event. + + This protects against accidentally creating Parquet files with too many columns, which + can cause performance issues in query engines. + + **Only applies when `infer_schema` is enabled**. Ignored when using explicit schema. + + **Default**: `1000` + + **Recommended values:** + - Standard use cases: `1000` (default) + - Wide tables: `500` - `1000` + - Performance-critical: `100` - `500` + + ### Compression Options + + #### compression Compression algorithm applied to Parquet column data: - `snappy` (default): Fast compression with moderate compression ratio @@ -206,6 +386,48 @@ components: sinks: aws_s3: components._aws & { - `brotli`: Good compression, web-optimized - `uncompressed`: No compression + ### compression_level + + Compression level for algorithms that support it (ZSTD, GZIP, Brotli). This controls the + trade-off between compression ratio and encoding speed. + + **ZSTD levels (1-22):** + - **1-3**: Fastest encoding, moderate compression (level 3 is default) + - **4-9**: Good balance of speed and compression + - **10-15**: Better compression, slower encoding (recommended for cold storage) + - **16-22**: Maximum compression, slowest encoding + + **GZIP levels (1-9):** + - **1-3**: Faster encoding, less compression + - **6**: Default balance (recommended) + - **9**: Maximum compression, slowest + + **Brotli levels (0-11):** + - **0-4**: Faster encoding + - **1**: Default (recommended) + - **5-11**: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + **Recommendation:** Use level 3-6 for hot data, 10-15 for cold storage. + + ### writer_version + + Parquet format version to write. Controls compatibility vs. performance. 
+ + **Options:** + - **v1** (default): PARQUET_1_0 - Maximum compatibility with older readers + - **v2**: PARQUET_2_0 - Modern format with better encoding and statistics + + **Version 2 benefits:** + - 10-20% more efficient encoding for certain data types + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + **When to use:** + - Use **v1** for maximum compatibility with pre-2018 tools + - Use **v2** for better performance with modern query engines (Athena, Spark, Presto) + ### row_group_size Number of rows per row group in the Parquet file. Row groups are Parquet's unit of @@ -227,81 +449,105 @@ components: sinks: aws_s3: components._aws & { would normally be non-nullable. This is useful when working with downstream systems that can handle NULL values through defaults or computed columns. - ### estimated_output_size - - Estimated compressed output size in bytes for buffer pre-allocation. This is an optional - performance tuning parameter that can significantly reduce memory overhead by pre-allocating - the output buffer to an appropriate size, avoiding repeated reallocations during encoding. + ### Per-Column Bloom Filters - **How to set this value:** - 1. Monitor actual compressed Parquet file sizes in production - 2. Set to approximately 1.2x your average observed compressed size for headroom - 3. ZSTD compression typically achieves 3-10x compression on JSON/log data + Bloom filters are probabilistic data structures that can significantly improve query + performance by allowing query engines (like AWS Athena, Apache Spark, and Presto) to + skip entire row groups when searching for specific values without reading the actual data. - **Example:** If your batches are 100MB uncompressed and compress to 10MB on average, - set `estimated_output_size: 12582912` (12MB) to provide some headroom. - - If not specified, Vector uses a heuristic based on estimated uncompressed size - (approximately 2KB per event, capped at 128MB). - - **Trade-offs:** - - **Too small**: Minimal benefit, will still require reallocations - - **Too large**: Wastes memory by over-allocating - - **Just right**: Optimal memory usage with minimal reallocations + **Only available when using explicit schema** (not available with automatic schema inference). - ### enable_bloom_filters + When using an explicit schema, you can enable Bloom filters on a per-column basis + by setting `bloom_filter: true` in the field definition. This gives you fine-grained + control over which columns get Bloom filters. - Enable Bloom filters for all columns in the Parquet file. Bloom filters are probabilistic - data structures that can significantly improve query performance by allowing query engines - (like AWS Athena, Apache Spark, and Presto) to skip entire row groups when searching for - specific values without reading the actual data. 
- - **When to enable:** + **When to use Bloom filters:** - High-cardinality columns: UUIDs, user IDs, session IDs, transaction IDs - String columns frequently used in WHERE clauses: URLs, emails, tags, names - Point queries: `WHERE user_id = 'abc123'` - IN clause queries: `WHERE id IN ('x', 'y', 'z')` + **When NOT to use Bloom filters:** + - Low-cardinality columns (countries, status codes, boolean flags) + - Columns rarely used in WHERE clauses + - Range queries (Bloom filters don't help with `>`, `<`, `BETWEEN`) + **Trade-offs:** - - **Pros**: Significantly faster queries, better row group pruning, reduced I/O + - **Pros**: Significantly faster queries (often 10-100x), better row group pruning, reduced I/O - **Cons**: Slightly larger file sizes (typically 1-5% overhead), minimal write overhead - **Default**: `false` (disabled) + **Configuration example:** - ### bloom_filter_fpp + ```yaml + schema: + user_id: + type: utf8 + bloom_filter: true # Enable for high-cardinality column + bloom_filter_num_distinct_values: 10000000 # Expected distinct values + bloom_filter_false_positive_pct: 0.01 # 1% false positive rate + event_name: + type: utf8 + bloom_filter: false # Skip for low-cardinality column + timestamp: + type: timestamp_microsecond + bloom_filter: false # Skip for timestamp (use sorting instead) + ``` - False positive probability (FPP) for Bloom filters. This controls the trade-off between - Bloom filter size and accuracy. Lower values produce larger but more accurate filters. + **Per-column Bloom filter settings:** - - **Default**: `0.05` (5% false positive rate) - - **Range**: Must be between 0.0 and 1.0 (exclusive) - - **Recommended values**: + - **bloom_filter**: Enable Bloom filter for this column (default: `false`) + - **bloom_filter_num_distinct_values**: Expected number of distinct values for this column's Bloom filter + - Low cardinality (countries, states): `1,000` - `100,000` + - Medium cardinality (cities, products): `100,000` - `1,000,000` + - High cardinality (user IDs, UUIDs): `10,000,000+` + - If not specified, defaults to `1,000,000` + - Automatically capped to the `row_group_size` value + - **bloom_filter_false_positive_pct**: False positive probability for this column's Bloom filter - `0.05` (5%): Good balance for general use - `0.01` (1%): Better for high-selectivity queries where precision matters - `0.10` (10%): Smaller filters when storage is a concern + - If not specified, defaults to `0.05` A false positive means the Bloom filter indicates a value *might* be in a row group when it actually isn't, requiring the engine to read and filter that row group. Lower FPP means fewer - unnecessary reads. + unnecessary reads but larger Bloom filters. - Only takes effect when `enable_bloom_filters` is `true`. + ### sorting_columns - ### bloom_filter_ndv + Pre-sort rows by specified columns before writing to Parquet. This can significantly improve + both compression ratios and query performance, especially for time-series data and event logs. - Estimated number of distinct values (NDV) for Bloom filter sizing. This should match the - expected cardinality of your columns. Higher values result in larger Bloom filters. 
+ **Benefits:** + - **20-40% better compression**: Similar values are grouped together, improving compression + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can cache sorted data more efficiently - - **Default**: `1,000,000` - - **Recommendation**: Analyze your data to determine actual cardinality - - Low cardinality (countries, states): `1,000` - `100,000` - - Medium cardinality (cities, products): `100,000` - `1,000,000` - - High cardinality (user IDs, UUIDs): `10,000,000+` + **Common patterns:** + - **Time-series data**: Sort by `timestamp` descending (most recent first) + - **Multi-tenant systems**: Sort by `tenant_id`, then `timestamp` + - **User analytics**: Sort by `user_id`, then `event_time` + - **Logs**: Sort by `timestamp`, then `severity` + + **Configuration:** + ```yaml + sorting_columns: + - column: timestamp + descending: true # Most recent first + - column: user_id + descending: false # A-Z order + ``` + + **Trade-offs:** + - **Write performance**: Adds 10-30% sorting overhead during encoding + - **Memory usage**: Requires buffering entire batch in memory for sorting + - **Most beneficial**: When queries frequently filter on sorted columns - **Important**: If your actual distinct value count significantly exceeds this number, the - false positive rate may increase beyond the configured `bloom_filter_fpp`, reducing query - performance gains. + **When to use:** + - Enable for time-series data where you query recent events frequently + - Enable for multi-tenant data partitioned by tenant_id + - Skip if write latency is critical and queries don't benefit from sorting - Only takes effect when `enable_bloom_filters` is `true`. + If not specified, rows are written in the order they appear in the batch. ## Batching Behavior @@ -316,10 +562,12 @@ components: sinks: aws_s3: components._aws & { - **Sink-level compression**: Set `compression: none` at the sink level since Parquet handles compression internally through its `parquet.compression` setting - - **All fields nullable**: Fields defined in the schema are nullable by default, allowing - for missing values - - **Schema required**: The schema cannot be inferred and must be explicitly configured - - **AWS Athena compatibility**: Use `gzip` compression for best Athena compatibility + - **Schema configuration**: You must choose either explicit schema or automatic schema + inference (`infer_schema: true`). For production use, explicit schemas are recommended + for consistency and access to advanced features like Bloom filters and sorting + - **All fields nullable**: Fields defined in explicit schemas are nullable by default, + allowing for missing values. Inferred schemas also create nullable fields + - **AWS Athena compatibility**: Use `gzip` or `snappy` compression for best Athena compatibility """ } diff --git a/website/cue/reference/components/sinks/generated/amqp.cue b/website/cue/reference/components/sinks/generated/amqp.cue index 66fb1312c5695..43b62c7ef3705 100644 --- a/website/cue/reference/components/sinks/generated/amqp.cue +++ b/website/cue/reference/components/sinks/generated/amqp.cue @@ -213,6 +213,15 @@ generated: components: sinks: amqp: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. 
+ It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -376,6 +385,324 @@ generated: components: sinks: amqp: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. 
+ + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. 
+ + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_cloudwatch_logs.cue b/website/cue/reference/components/sinks/generated/aws_cloudwatch_logs.cue index 12686c9b27b65..2e6ef6ad800c8 100644 --- a/website/cue/reference/components/sinks/generated/aws_cloudwatch_logs.cue +++ b/website/cue/reference/components/sinks/generated/aws_cloudwatch_logs.cue @@ -409,6 +409,15 @@ generated: components: sinks: aws_cloudwatch_logs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -572,6 +581,324 @@ generated: components: sinks: aws_cloudwatch_logs: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. 
+ """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. 
+ """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. 
+ """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_kinesis_firehose.cue b/website/cue/reference/components/sinks/generated/aws_kinesis_firehose.cue index da311f458462e..cd0db2c964c77 100644 --- a/website/cue/reference/components/sinks/generated/aws_kinesis_firehose.cue +++ b/website/cue/reference/components/sinks/generated/aws_kinesis_firehose.cue @@ -388,6 +388,15 @@ generated: components: sinks: aws_kinesis_firehose: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -551,6 +560,324 @@ generated: components: sinks: aws_kinesis_firehose: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. 
+ """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. 
Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. 
+ + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_kinesis_streams.cue b/website/cue/reference/components/sinks/generated/aws_kinesis_streams.cue index 4a800fa6e35da..dcf0475017bfe 100644 --- a/website/cue/reference/components/sinks/generated/aws_kinesis_streams.cue +++ b/website/cue/reference/components/sinks/generated/aws_kinesis_streams.cue @@ -388,6 +388,15 @@ generated: components: sinks: aws_kinesis_streams: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -551,6 +560,324 @@ generated: components: sinks: aws_kinesis_streams: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." 
+ relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. 
Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. 
+ + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_s3.cue b/website/cue/reference/components/sinks/generated/aws_s3.cue index 65477e6ae498a..56cdb39d51a79 100644 --- a/website/cue/reference/components/sinks/generated/aws_s3.cue +++ b/website/cue/reference/components/sinks/generated/aws_s3.cue @@ -500,11 +500,9 @@ generated: components: sinks: aws_s3: configuration: { parquet: """ Encodes events in [Apache Parquet][apache_parquet] columnar format. 
- Parquet is a columnar storage format optimized for analytics workloads. It provides - efficient compression and encoding schemes, making it ideal for long-term storage and - query performance with tools like AWS Athena, Apache Spark, and Presto. - - This is a batch encoder that writes one Parquet file per batch with proper metadata and footers. + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. [apache_parquet]: https://parquet.apache.org/ """ @@ -674,128 +672,317 @@ generated: components: sinks: aws_s3: configuration: { parquet: { description: "Apache Parquet-specific encoder options." relevant_when: "codec = \"parquet\"" - required: false + required: true type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } compression: { - description: "Compression algorithm for Parquet columns." - required: false + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false type: string: { default: "snappy" enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" snappy: "Snappy compression (fast, moderate compression ratio)" - gzip: "GZIP compression (balanced, good for AWS Athena)" - zstd: "ZSTD compression (best compression ratio)" - lz4: "LZ4 compression (very fast)" - brotli: "Brotli compression (good compression)" uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" } + examples: ["snappy", "gzip", "zstd"] } } - row_group_size: { + compression_level: { description: """ - Number of rows per row group. + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest - Row groups are Parquet's unit of parallelization. Larger row groups can improve - compression but increase memory usage during encoding. If not specified, defaults - to the batch size. - """ + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. 
+ """ required: false - type: uint: { - default: null - examples: [100000, 1000000] - } + type: int: examples: [3, 6, 10] } - allow_nullable_fields: { + exclude_columns: { description: """ - Allow null values for non-nullable fields in the schema. + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the schema. This is useful when working with downstream - systems that can handle null values through defaults or computed columns. - """ + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ required: false - type: bool: default: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] } - estimated_output_size: { + infer_schema: { description: """ - Estimated compressed output size in bytes for buffer pre-allocation. - - Pre-allocating the output buffer based on expected compressed size significantly - reduces memory overhead by avoiding repeated reallocations during encoding. - If not specified, defaults to a heuristic based on estimated uncompressed size. - - Guidelines for setting this value: - - Monitor actual compressed output sizes in production - - Set to ~1.2x your average observed compressed size for headroom - - ZSTD typically achieves 3-10x compression on JSON data - - Example: If batches are 100MB uncompressed and compress to 10MB, set to ~12MB - """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ required: false - type: uint: { - default: null - examples: [10485760, 52428800] + type: bool: { + default: false + examples: [true] } } - enable_bloom_filters: { + max_columns: { description: """ - Enable Bloom filters for all columns. - - Bloom filters are probabilistic data structures that can significantly improve - query performance by allowing query engines to skip entire row groups when - searching for specific values. They are especially effective for: - - High-cardinality columns (UUIDs, user IDs, session IDs) - - String columns (URLs, emails, tags) - - Point queries (WHERE column = 'value') - - IN clauses (WHERE column IN (...)) - - Trade-offs: - - Pros: Faster queries, better row group pruning in engines like Athena/Spark - - Cons: Slightly larger file sizes (typically 1-5% overhead), minimal write overhead - - When disabled (default), no Bloom filters are written. 
- """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ required: false - type: bool: { - default: false - examples: [true, false] + type: uint: { + default: 1000 + examples: [500, 1000] } } - bloom_filter_fpp: { + row_group_size: { description: """ - False positive probability for Bloom filters. + Number of rows per row group - This controls the trade-off between Bloom filter size and accuracy. - Lower values produce larger but more accurate filters. + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. - - Default: 0.05 (5% false positive rate) - - Range: Must be between 0.0 and 1.0 (exclusive) - - Recommended: 0.01 (1%) for high-selectivity queries, 0.05 (5%) for general use + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding - Only takes effect when enable_bloom_filters is true. - """ + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. 
+ + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ required: false - type: float: { - default: 0.05 - examples: [0.05, 0.01] + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } } } - bloom_filter_ndv: { + writer_version: { description: """ - Estimated number of distinct values for Bloom filter sizing. + Parquet format writer version. - This should match the expected cardinality of your columns. Higher values - result in larger Bloom filters. If your actual distinct value count significantly - exceeds this number, the false positive rate may increase. + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Default: 1,000,000 - - Recommended: Set based on your data's actual cardinality + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features - Only takes effect when enable_bloom_filters is true. - """ + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). 
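Putting the aws_s3 options together, a hedged end-to-end sink sketch: `inputs`, `bucket`, and `region` are placeholders, `batch.max_events` is the standard sink batch setting that `row_group_size` is documented against, and all values are illustrative.

```yaml
# Sketch only: aws_s3 sink writing one Parquet file per batch.
sinks:
  parquet_archive:
    type: aws_s3
    inputs: ["app_logs"]          # placeholder source
    bucket: "example-analytics"   # placeholder bucket
    region: "us-east-1"
    batch:
      max_events: 100000
    encoding:
      codec: parquet
      parquet:
        writer_version: v2
        compression: zstd
        compression_level: 3
        infer_schema: true
        row_group_size: 100000    # keep <= batch.max_events
```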
+ """ required: false - type: uint: { - default: 1000000 - examples: [1000000, 10000000] + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] } } } @@ -878,7 +1065,6 @@ generated: components: sinks: aws_s3: configuration: { required: false type: string: examples: [ "json", - "parquet", ] } filename_time_format: { diff --git a/website/cue/reference/components/sinks/generated/aws_sns.cue b/website/cue/reference/components/sinks/generated/aws_sns.cue index 1d2413066b071..c17532a348f9f 100644 --- a/website/cue/reference/components/sinks/generated/aws_sns.cue +++ b/website/cue/reference/components/sinks/generated/aws_sns.cue @@ -319,6 +319,15 @@ generated: components: sinks: aws_sns: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -482,6 +491,324 @@ generated: components: sinks: aws_sns: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. 
+ + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. 
+ + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. 
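The multi-tenant pattern listed above (sort by tenant_id, then timestamp) would look roughly like the following; the column names are assumptions about the event shape.

```yaml
# Sketch only: multi-tenant sort order within each row group.
encoding:
  codec: parquet
  parquet:
    sorting_columns:
      - column: tenant_id
        descending: false
      - column: timestamp
        descending: true
```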
+ """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_sqs.cue b/website/cue/reference/components/sinks/generated/aws_sqs.cue index 912e1f2c43ea6..0b047cae1a45b 100644 --- a/website/cue/reference/components/sinks/generated/aws_sqs.cue +++ b/website/cue/reference/components/sinks/generated/aws_sqs.cue @@ -319,6 +319,15 @@ generated: components: sinks: aws_sqs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -482,6 +491,324 @@ generated: components: sinks: aws_sqs: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. 
+ """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. 
Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. 
+ + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/azure_blob.cue b/website/cue/reference/components/sinks/generated/azure_blob.cue index 69bdd368f1338..fc99f7b11486c 100644 --- a/website/cue/reference/components/sinks/generated/azure_blob.cue +++ b/website/cue/reference/components/sinks/generated/azure_blob.cue @@ -343,6 +343,15 @@ generated: components: sinks: azure_blob: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -506,6 +515,324 @@ generated: components: sinks: azure_blob: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." 
+ relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. 
Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. 
+ + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/console.cue b/website/cue/reference/components/sinks/generated/console.cue index 16545964f6950..795ea8adeb382 100644 --- a/website/cue/reference/components/sinks/generated/console.cue +++ b/website/cue/reference/components/sinks/generated/console.cue @@ -197,6 +197,15 @@ generated: components: sinks: console: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -360,6 +369,324 @@ generated: components: sinks: console: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. 
+ Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." 
+ required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. 
+ + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/file.cue b/website/cue/reference/components/sinks/generated/file.cue index 3135f4b70db5f..731adbd5303f1 100644 --- a/website/cue/reference/components/sinks/generated/file.cue +++ b/website/cue/reference/components/sinks/generated/file.cue @@ -217,6 +217,15 @@ generated: components: sinks: file: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -380,6 +389,324 @@ generated: components: sinks: file: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. 
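+
+ For example (a hedged sketch; tune the level per workload using the guides below),
+ pairing ZSTD with an explicit level looks like:
+
+ ```yaml
+ encoding:
+   codec: parquet
+   parquet:
+     compression: zstd
+     compression_level: 6
+ ```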
+ + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. 
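+
+ As an illustrative sketch (field names are placeholders; valid type names are listed
+ below), an explicit schema with a per-column Bloom filter might look like:
+
+ ```yaml
+ encoding:
+   codec: parquet
+   parquet:
+     schema:
+       timestamp:
+         type: timestamp_microsecond
+       user_id:
+         type: utf8
+         bloom_filter: true
+         bloom_filter_ndv: 1000000
+       status_code:
+         type: int64
+ ```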
+ + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. 
+ """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/gcp_chronicle_unstructured.cue b/website/cue/reference/components/sinks/generated/gcp_chronicle_unstructured.cue index 0a027dfa45a74..bd17da68ca422 100644 --- a/website/cue/reference/components/sinks/generated/gcp_chronicle_unstructured.cue +++ b/website/cue/reference/components/sinks/generated/gcp_chronicle_unstructured.cue @@ -285,6 +285,15 @@ generated: components: sinks: gcp_chronicle_unstructured: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -448,6 +457,324 @@ generated: components: sinks: gcp_chronicle_unstructured: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. 
+ Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. 
Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. 
+ + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/gcp_cloud_storage.cue b/website/cue/reference/components/sinks/generated/gcp_cloud_storage.cue index 22f565c34665d..785e1879f463d 100644 --- a/website/cue/reference/components/sinks/generated/gcp_cloud_storage.cue +++ b/website/cue/reference/components/sinks/generated/gcp_cloud_storage.cue @@ -358,6 +358,15 @@ generated: components: sinks: gcp_cloud_storage: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -521,6 +530,324 @@ generated: components: sinks: gcp_cloud_storage: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." 
+ relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. 
Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. 
+ + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/gcp_pubsub.cue b/website/cue/reference/components/sinks/generated/gcp_pubsub.cue index 385bf9dd13793..e0aa0fd3faf6e 100644 --- a/website/cue/reference/components/sinks/generated/gcp_pubsub.cue +++ b/website/cue/reference/components/sinks/generated/gcp_pubsub.cue @@ -264,6 +264,15 @@ generated: components: sinks: gcp_pubsub: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -427,6 +436,324 @@ generated: components: sinks: gcp_pubsub: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. 
+ Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." 
+ required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. 
+ + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/http.cue b/website/cue/reference/components/sinks/generated/http.cue index cc4da5bf49425..3c41e7ebad7b3 100644 --- a/website/cue/reference/components/sinks/generated/http.cue +++ b/website/cue/reference/components/sinks/generated/http.cue @@ -447,6 +447,15 @@ generated: components: sinks: http: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -610,6 +619,324 @@ generated: components: sinks: http: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. 
+ + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. 
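+
+ A minimal sketch of an explicit schema in YAML, mirroring the example fields shown in this
+ reference (the nesting under `encoding.parquet` is assumed from the surrounding options):
+
+ ```yaml
+ encoding:
+   codec: parquet
+   parquet:
+     schema:
+       id:
+         type: int64
+       name:
+         type: utf8
+         bloom_filter: true
+         bloom_filter_fpp: 0.01
+         bloom_filter_ndv: 1000000
+       timestamp:
+         type: timestamp_microsecond
+ ```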
+ + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. 
+ """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/humio_logs.cue b/website/cue/reference/components/sinks/generated/humio_logs.cue index b9e46513d8b9c..9f8dc77ad41cc 100644 --- a/website/cue/reference/components/sinks/generated/humio_logs.cue +++ b/website/cue/reference/components/sinks/generated/humio_logs.cue @@ -263,6 +263,15 @@ generated: components: sinks: humio_logs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -426,6 +435,324 @@ generated: components: sinks: humio_logs: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. 
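+
+ A minimal sketch of overriding the default compression algorithm and level (the nesting
+ under `encoding.parquet` is assumed from the surrounding options):
+
+ ```yaml
+ encoding:
+   codec: parquet
+   parquet:
+     compression: zstd
+     compression_level: 6
+ ```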
+ """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. 
Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. 
+ + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/kafka.cue b/website/cue/reference/components/sinks/generated/kafka.cue index 9b3ef74b7dd76..8ca8a02297a7c 100644 --- a/website/cue/reference/components/sinks/generated/kafka.cue +++ b/website/cue/reference/components/sinks/generated/kafka.cue @@ -252,6 +252,15 @@ generated: components: sinks: kafka: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -415,6 +424,324 @@ generated: components: sinks: kafka: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." 
+ relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. 
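+
+ A minimal inferred-schema sketch (the nesting under `encoding.parquet` is assumed from the
+ surrounding options; the column names are illustrative):
+
+ ```yaml
+ encoding:
+   codec: parquet
+   parquet:
+     infer_schema: true
+     exclude_columns: ["_metadata", "internal_id"]
+     max_columns: 500
+ ```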
Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. 
+ + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/loki.cue b/website/cue/reference/components/sinks/generated/loki.cue index 933394342d1bd..d9c3674435a8f 100644 --- a/website/cue/reference/components/sinks/generated/loki.cue +++ b/website/cue/reference/components/sinks/generated/loki.cue @@ -449,6 +449,15 @@ generated: components: sinks: loki: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -612,6 +621,324 @@ generated: components: sinks: loki: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. 
+ Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." 
+ required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. 
+ + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/mqtt.cue b/website/cue/reference/components/sinks/generated/mqtt.cue index 980996cff164d..0ad10982b42ae 100644 --- a/website/cue/reference/components/sinks/generated/mqtt.cue +++ b/website/cue/reference/components/sinks/generated/mqtt.cue @@ -207,6 +207,15 @@ generated: components: sinks: mqtt: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -370,6 +379,324 @@ generated: components: sinks: mqtt: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. 
+ + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. 
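+
+                                    For illustration only (the field names below are hypothetical and not required by the codec),
+                                    a minimal explicit schema might be declared in a sink's encoding options as:
+
+                                    ```yaml
+                                    encoding:
+                                      codec: parquet
+                                      parquet:
+                                        schema:
+                                          timestamp:
+                                            type: timestamp_microsecond
+                                          message:
+                                            type: utf8
+                                          status_code:
+                                            type: int64
+                                    ```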
+ + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. 
+ """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/nats.cue b/website/cue/reference/components/sinks/generated/nats.cue index 0979492903e9c..8a0899a1db6bf 100644 --- a/website/cue/reference/components/sinks/generated/nats.cue +++ b/website/cue/reference/components/sinks/generated/nats.cue @@ -297,6 +297,15 @@ generated: components: sinks: nats: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -460,6 +469,324 @@ generated: components: sinks: nats: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. 
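+
+                                    For illustration only, a pipeline that favors encoding throughput over file size
+                                    might select LZ4:
+
+                                    ```yaml
+                                    encoding:
+                                      codec: parquet
+                                      parquet:
+                                        compression: lz4
+                                    ```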
+ """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. 
Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. 
+ + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/opentelemetry.cue b/website/cue/reference/components/sinks/generated/opentelemetry.cue index 897288dbdfe72..e70b8ebd8faee 100644 --- a/website/cue/reference/components/sinks/generated/opentelemetry.cue +++ b/website/cue/reference/components/sinks/generated/opentelemetry.cue @@ -450,6 +450,15 @@ generated: components: sinks: opentelemetry: configuration: protocol: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -613,6 +622,324 @@ generated: components: sinks: opentelemetry: configuration: protocol: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." 
+ relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. 
Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. 
+ + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/papertrail.cue b/website/cue/reference/components/sinks/generated/papertrail.cue index b69042f48b2ff..0b1ed7cc43906 100644 --- a/website/cue/reference/components/sinks/generated/papertrail.cue +++ b/website/cue/reference/components/sinks/generated/papertrail.cue @@ -197,6 +197,15 @@ generated: components: sinks: papertrail: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -360,6 +369,324 @@ generated: components: sinks: papertrail: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. 
+ Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." 
+ required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. 
+ + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/pulsar.cue b/website/cue/reference/components/sinks/generated/pulsar.cue index cc2cb6c90cb3a..bc71f124b89be 100644 --- a/website/cue/reference/components/sinks/generated/pulsar.cue +++ b/website/cue/reference/components/sinks/generated/pulsar.cue @@ -331,6 +331,15 @@ generated: components: sinks: pulsar: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -494,6 +503,324 @@ generated: components: sinks: pulsar: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. 
+ + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. 
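+
+                                    For illustration only (the `user_id` column and the cardinality figures are hypothetical),
+                                    a schema that enables a Bloom filter on a high-cardinality column might look like:
+
+                                    ```yaml
+                                    encoding:
+                                      codec: parquet
+                                      parquet:
+                                        schema:
+                                          user_id:
+                                            type: utf8
+                                            bloom_filter: true
+                                            bloom_filter_fpp: 0.01
+                                            bloom_filter_ndv: 10000000
+                                          timestamp:
+                                            type: timestamp_microsecond
+                                    ```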
+ + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. 
+ """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/redis.cue b/website/cue/reference/components/sinks/generated/redis.cue index fdd5686420da8..a3d98de388e65 100644 --- a/website/cue/reference/components/sinks/generated/redis.cue +++ b/website/cue/reference/components/sinks/generated/redis.cue @@ -256,6 +256,15 @@ generated: components: sinks: redis: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -419,6 +428,324 @@ generated: components: sinks: redis: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. 
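+
+                      For example, a minimal sketch of selecting a different codec (the rest of the `encoding` options are omitted here for brevity):
+
+                      ```yaml
+                      encoding:
+                        codec: parquet
+                        parquet:
+                          compression: zstd
+                      ```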
+ """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. 
Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. 
+ + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/socket.cue b/website/cue/reference/components/sinks/generated/socket.cue index 17d6e8cf7d4d0..11dc80283e144 100644 --- a/website/cue/reference/components/sinks/generated/socket.cue +++ b/website/cue/reference/components/sinks/generated/socket.cue @@ -209,6 +209,15 @@ generated: components: sinks: socket: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -372,6 +381,324 @@ generated: components: sinks: socket: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." 
+ relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. 
Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. 
+ + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/splunk_hec_logs.cue b/website/cue/reference/components/sinks/generated/splunk_hec_logs.cue index ce87f34e9ff83..a799b603b8d87 100644 --- a/website/cue/reference/components/sinks/generated/splunk_hec_logs.cue +++ b/website/cue/reference/components/sinks/generated/splunk_hec_logs.cue @@ -313,6 +313,15 @@ generated: components: sinks: splunk_hec_logs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -476,6 +485,324 @@ generated: components: sinks: splunk_hec_logs: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. 
+ """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. 
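+
+                      A minimal schema sketch (field names are hypothetical; each field maps to an object with at least a `type`):
+
+                      ```yaml
+                      encoding:
+                        codec: parquet
+                        parquet:
+                          schema:
+                            id:
+                              type: int64
+                            message:
+                              type: utf8
+                            timestamp:
+                              type: timestamp_microsecond
+                      ```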
+ """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. 
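+
+                      Combined with an explicit schema, a time-series configuration might look something like the following sketch (field names are placeholders):
+
+                      ```yaml
+                      encoding:
+                        codec: parquet
+                        parquet:
+                          schema:
+                            timestamp:
+                              type: timestamp_microsecond
+                            user_id:
+                              type: utf8
+                          sorting_columns:
+                            - column: timestamp
+                              descending: true
+                      ```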
+ """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/webhdfs.cue b/website/cue/reference/components/sinks/generated/webhdfs.cue index 236bd9f491f33..56630d46f67e3 100644 --- a/website/cue/reference/components/sinks/generated/webhdfs.cue +++ b/website/cue/reference/components/sinks/generated/webhdfs.cue @@ -263,6 +263,15 @@ generated: components: sinks: webhdfs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -426,6 +435,324 @@ generated: components: sinks: webhdfs: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. 
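+
+                      For instance, an illustrative snippet selecting LZ4 when encoding speed matters more than file size (other options omitted):
+
+                      ```yaml
+                      encoding:
+                        codec: parquet
+                        parquet:
+                          compression: lz4
+                      ```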
+ """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. 
Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. 
+ + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/websocket.cue b/website/cue/reference/components/sinks/generated/websocket.cue index 146bba2dada78..f0d1f4264fca9 100644 --- a/website/cue/reference/components/sinks/generated/websocket.cue +++ b/website/cue/reference/components/sinks/generated/websocket.cue @@ -376,6 +376,15 @@ generated: components: sinks: websocket: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -539,6 +548,324 @@ generated: components: sinks: websocket: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." 
+ relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. + """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. 
Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. + """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. 
+ + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. + """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/websocket_server.cue b/website/cue/reference/components/sinks/generated/websocket_server.cue index f822ab29b8626..eac5ac077c802 100644 --- a/website/cue/reference/components/sinks/generated/websocket_server.cue +++ b/website/cue/reference/components/sinks/generated/websocket_server.cue @@ -253,6 +253,15 @@ generated: components: sinks: websocket_server: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ + parquet: """ + Encodes events in [Apache Parquet][apache_parquet] columnar format. + + Parquet is a columnar storage format optimized for analytics workloads. + It provides efficient compression and encoding schemes, making it ideal + for long-term storage and query performance. + + [apache_parquet]: https://parquet.apache.org/ + """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -416,6 +425,324 @@ generated: components: sinks: websocket_server: configuration: { required: false type: array: items: type: string: {} } + parquet: { + description: "Apache Parquet-specific encoder options." + relevant_when: "codec = \"parquet\"" + required: true + type: object: options: { + allow_nullable_fields: { + description: """ + Allow null values for non-nullable fields in the schema. + + When enabled, missing or incompatible values will be encoded as null even for fields + marked as non-nullable in the Arrow schema. This is useful when working with downstream + systems that can handle null values through defaults, computed columns, or other mechanisms. + + When disabled (default), missing values for non-nullable fields will cause encoding errors, + ensuring all required data is present before writing to Parquet. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + compression: { + description: """ + Compression algorithm to use for Parquet columns + + Compression is applied to all columns in the Parquet file. + Snappy provides a good balance of speed and compression ratio. + """ + required: false + type: string: { + default: "snappy" + enum: { + brotli: "Brotli compression" + gzip: "GZIP compression (slower, better compression ratio)" + lz4: "LZ4 compression (very fast, moderate compression)" + snappy: "Snappy compression (fast, moderate compression ratio)" + uncompressed: "No compression" + zstd: "ZSTD compression (good balance of speed and compression)" + } + examples: ["snappy", "gzip", "zstd"] + } + } + compression_level: { + description: """ + Compression level for algorithms that support it. + + Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. + + **ZSTD levels** (1-22): + - 1-3: Fastest, moderate compression (level 3 is default) + - 4-9: Good balance of speed and compression + - 10-15: Better compression, slower encoding + - 16-22: Maximum compression, slowest (good for cold storage) + + **GZIP levels** (1-9): + - 1-3: Faster, less compression + - 6: Default balance (recommended) + - 9: Maximum compression, slowest + + **Brotli levels** (0-11): + - 0-4: Faster encoding + - 1: Default (recommended) + - 5-11: Better compression, slower + + Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. + Recommended: Use level 3-6 for hot data, 10-15 for cold storage. 
+ """ + required: false + type: int: examples: [3, 6, 10] + } + exclude_columns: { + description: """ + Column names to exclude from Parquet encoding + + These columns will be completely excluded from the Parquet file. + Useful for filtering out metadata, internal fields, or temporary data. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] + } + infer_schema: { + description: """ + Automatically infer schema from event data + + When enabled, the schema is inferred from each batch of events independently. + The schema is determined by examining the types of values in the events. + + **Type mapping:** + - String values → `utf8` + - Integer values → `int64` + - Float values → `float64` + - Boolean values → `boolean` + - Timestamp values → `timestamp_microsecond` + - Arrays/Objects → `utf8` (serialized as JSON) + + **Type conflicts:** If a field has different types across events in the same batch, + it will be encoded as `utf8` (string) and all values will be converted to strings. + + **Important:** Schema consistency across batches is the operator's responsibility. + Use VRL transforms to ensure consistent types if needed. Each batch may produce + a different schema if event structure varies. + + **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. + + Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + max_columns: { + description: """ + Maximum number of columns to encode + + Limits the number of columns in the Parquet file. Additional columns beyond + this limit will be silently dropped. Columns are selected in the order they + appear in the first event. + + Only applies when `infer_schema` is enabled. Ignored when using explicit schema. + """ + required: false + type: uint: { + default: 1000 + examples: [500, 1000] + } + } + row_group_size: { + description: """ + Number of rows per row group + + Row groups are Parquet's unit of parallelization. Larger row groups + can improve compression but increase memory usage during encoding. + + Since each batch becomes a separate Parquet file, this value + should be <= the batch max_events setting. Row groups cannot span multiple files. + If not specified, defaults to the batch size. + """ + required: false + type: uint: examples: [100000, 1000000] + } + schema: { + description: """ + The Arrow schema definition to use for encoding + + This schema defines the structure and types of the Parquet file columns. + Specified as a map of field names to data types. + + Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. + + Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, + float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, + timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. 
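As a point of reference, the example schema documented above (`id: int64`, `name: utf8`, `timestamp: timestamp_microsecond`) corresponds to an Arrow schema along the following lines. This is a hand-written sketch of the Arrow-level equivalent, not the codec's own construction path, and the fields are shown as nullable for simplicity.

```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema, TimeUnit};

// Arrow-level equivalent of the documented example schema.
fn example_schema() -> Arc<Schema> {
    Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int64, true),
        Field::new("name", DataType::Utf8, true),
        Field::new(
            "timestamp",
            DataType::Timestamp(TimeUnit::Microsecond, None),
            true,
        ),
    ]))
}
```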
+ """ + required: false + type: object: { + examples: [{ + id: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "int64" + } + name: { + bloom_filter: true + bloom_filter_fpp: 0.01 + bloom_filter_ndv: 1000000 + type: "utf8" + } + timestamp: { + bloom_filter: false + bloom_filter_fpp: null + bloom_filter_ndv: null + type: "timestamp_microsecond" + } + }] + options: "*": { + description: "A field definition specifying the data type and optional Bloom filter configuration." + required: true + type: object: options: { + bloom_filter: { + description: """ + Enable Bloom filter for this specific column + + When enabled, a Bloom filter will be created for this column to improve + query performance for point lookups and IN clauses. Only enable for + high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + bloom_filter_fpp: { + description: """ + False positive probability for this column's Bloom filter + + Lower values create larger but more accurate filters. + + - 0.05 (5%): Good balance for general use + - 0.01 (1%): Better for high-selectivity queries + """ + required: false + type: float: examples: [0.05, 0.01] + } + bloom_filter_ndv: { + description: """ + Number of distinct values expected for this column's Bloom filter + + This controls the size of the Bloom filter. Should match the actual + cardinality of the column. Will be automatically capped to the batch size. + + - Low cardinality (countries, states): 1,000 - 100,000 + - Medium cardinality (cities, products): 100,000 - 1,000,000 + - High cardinality (UUIDs, user IDs): 10,000,000+ + """ + required: false + type: uint: examples: [1000000, 10000000] + } + type: { + description: "Data type for this field" + required: true + type: string: examples: ["utf8", "int64", "timestamp_ms"] + } + } + } + } + } + sorting_columns: { + description: """ + Sorting order for rows within row groups. + + Pre-sorting rows by specified columns before writing can significantly improve both + compression ratios and query performance. This is especially valuable for time-series + data and event logs. + + **Benefits:** + - **Better compression** (20-40% smaller files): Similar values are grouped together + - **Faster queries**: More effective min/max statistics enable better row group skipping + - **Improved caching**: Query engines can more efficiently cache sorted data + + **Common patterns:** + - Time-series: Sort by timestamp descending (most recent first) + - Multi-tenant: Sort by tenant_id, then timestamp + - User analytics: Sort by user_id, then event_time + + **Trade-offs:** + - Adds sorting overhead during encoding (typically 10-30% slower writes) + - Requires buffering entire batch in memory for sorting + - Most beneficial when queries frequently filter on sorted columns + + **Example:** + ```yaml + sorting_columns: + - column: timestamp + descending: true + - column: user_id + descending: false + ``` + + If not specified, rows are written in the order they appear in the batch. 
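For context, a sketch (assumptions noted) of how sorting and per-column Bloom filter options of this kind map onto `WriterProperties` in the `parquet` crate. The `timestamp`/`user_id` column names, and the index positions 0 and 1, are hypothetical and depend on the actual schema order; this is not the codec's own wiring.

```rust
use parquet::basic::Compression;
use parquet::file::properties::WriterProperties;
use parquet::format::SortingColumn;
use parquet::schema::types::ColumnPath;

// Hypothetical layout: column 0 = timestamp, column 1 = user_id.
fn example_sorted_props() -> WriterProperties {
    WriterProperties::builder()
        .set_compression(Compression::SNAPPY)
        // Row-group sort order: timestamp descending, then user_id ascending.
        .set_sorting_columns(Some(vec![
            SortingColumn { column_idx: 0, descending: true, nulls_first: false },
            SortingColumn { column_idx: 1, descending: false, nulls_first: false },
        ]))
        // Per-column Bloom filter for the high-cardinality user_id column.
        .set_column_bloom_filter_enabled(ColumnPath::from("user_id"), true)
        .set_column_bloom_filter_fpp(ColumnPath::from("user_id"), 0.01)
        .set_column_bloom_filter_ndv(ColumnPath::from("user_id"), 1_000_000)
        .build()
}
```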
+ """ + required: false + type: array: items: type: object: options: { + column: { + description: "Name of the column to sort by" + required: true + type: string: examples: ["timestamp", "user_id"] + } + descending: { + description: """ + Sort in descending order (true) or ascending order (false) + + - `true`: Descending (Z-A, 9-0, newest-oldest) + - `false`: Ascending (A-Z, 0-9, oldest-newest) + """ + required: false + type: bool: { + default: false + examples: [true] + } + } + } + } + writer_version: { + description: """ + Parquet format writer version. + + Controls which Parquet format version to write: + - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) + - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics + + Version 2 benefits: + - More efficient encoding for certain data types (10-20% smaller files) + - Better statistics for query optimization + - Improved data page format + - Required for some advanced features + + Use v1 for maximum compatibility with older readers (pre-2018 tools). + Use v2 for better performance with modern query engines (Athena, Spark, Presto). + """ + required: false + type: string: { + default: "v2" + enum: { + v1: "Parquet format version 1.0 (maximum compatibility)" + v2: "Parquet format version 2.0 (modern format with better encoding)" + } + examples: ["v1", "v2"] + } + } + } + } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" From f9178c20b4742a4179b7899c2ef54188bbd5e0fd Mon Sep 17 00:00:00 2001 From: Thomas Date: Mon, 22 Dec 2025 16:02:06 -0500 Subject: [PATCH 08/13] Add exceptions to spelling/expect.txt --- .github/actions/spelling/expect.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index 1d68a6978570b..04296aa8cb035 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -216,6 +216,7 @@ fns foobarfoobarfoo footgunning Forcepoint +fpp freelist fuzzcheck GC'ing @@ -380,6 +381,7 @@ myvalue Namazu nats ndjson +ndv nearline nextest ngx From a0ca0bf4e1df0af3cc0bc74dca7ad0ff57c9c68c Mon Sep 17 00:00:00 2001 From: Thomas Date: Mon, 22 Dec 2025 16:15:28 -0500 Subject: [PATCH 09/13] cargo fmt --- lib/codecs/src/encoding/format/mod.rs | 4 +- lib/codecs/src/encoding/format/parquet.rs | 100 ++++++++++-------- .../src/encoding/format/schema_definition.rs | 16 ++- lib/codecs/src/encoding/mod.rs | 6 +- lib/codecs/src/encoding/serializer.rs | 52 ++++----- src/codecs/encoding/config.rs | 5 +- 6 files changed, 95 insertions(+), 88 deletions(-) diff --git a/lib/codecs/src/encoding/format/mod.rs b/lib/codecs/src/encoding/format/mod.rs index a042aeaf4321c..5953ec29dc1a7 100644 --- a/lib/codecs/src/encoding/format/mod.rs +++ b/lib/codecs/src/encoding/format/mod.rs @@ -41,7 +41,9 @@ pub use native_json::{NativeJsonSerializer, NativeJsonSerializerConfig}; #[cfg(feature = "opentelemetry")] pub use otlp::{OtlpSerializer, OtlpSerializerConfig}; #[cfg(feature = "parquet")] -pub use parquet::{ParquetCompression, ParquetEncodingError, ParquetSerializer, ParquetSerializerConfig}; +pub use parquet::{ + ParquetCompression, ParquetEncodingError, ParquetSerializer, ParquetSerializerConfig, +}; pub use protobuf::{ProtobufSerializer, ProtobufSerializerConfig, ProtobufSerializerOptions}; pub use raw_message::{RawMessageSerializer, RawMessageSerializerConfig}; #[cfg(any(feature = "arrow", feature = "parquet"))] diff --git a/lib/codecs/src/encoding/format/parquet.rs 
b/lib/codecs/src/encoding/format/parquet.rs index f487d0a0f583d..edea26c5dac19 100644 --- a/lib/codecs/src/encoding/format/parquet.rs +++ b/lib/codecs/src/encoding/format/parquet.rs @@ -5,10 +5,10 @@ //! suitable for long-term storage and analytics workloads. use arrow::datatypes::Schema; -use bytes::{Bytes, BytesMut, BufMut}; +use bytes::{BufMut, Bytes, BytesMut}; use parquet::{ arrow::ArrowWriter, - basic::{Compression, ZstdLevel, GzipLevel, BrotliLevel}, + basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}, file::properties::{WriterProperties, WriterVersion}, schema::types::ColumnPath, }; @@ -19,7 +19,7 @@ use vector_config::configurable_component; use vector_core::event::Event; // Reuse the Arrow encoder's record batch building logic -use super::arrow::{build_record_batch, ArrowEncodingError}; +use super::arrow::{ArrowEncodingError, build_record_batch}; use super::schema_definition::SchemaDefinition; /// Compression algorithm for Parquet files @@ -49,23 +49,17 @@ impl ParquetCompression { (ParquetCompression::Uncompressed, _) => Ok(Compression::UNCOMPRESSED), (ParquetCompression::Snappy, _) => Ok(Compression::SNAPPY), (ParquetCompression::Lz4, _) => Ok(Compression::LZ4), - (ParquetCompression::Gzip, Some(lvl)) => { - GzipLevel::try_new(lvl as u32) - .map(Compression::GZIP) - .map_err(|e| format!("Invalid GZIP compression level: {}", e)) - } + (ParquetCompression::Gzip, Some(lvl)) => GzipLevel::try_new(lvl as u32) + .map(Compression::GZIP) + .map_err(|e| format!("Invalid GZIP compression level: {}", e)), (ParquetCompression::Gzip, None) => Ok(Compression::GZIP(Default::default())), - (ParquetCompression::Brotli, Some(lvl)) => { - BrotliLevel::try_new(lvl as u32) - .map(Compression::BROTLI) - .map_err(|e| format!("Invalid Brotli compression level: {}", e)) - } + (ParquetCompression::Brotli, Some(lvl)) => BrotliLevel::try_new(lvl as u32) + .map(Compression::BROTLI) + .map_err(|e| format!("Invalid Brotli compression level: {}", e)), (ParquetCompression::Brotli, None) => Ok(Compression::BROTLI(Default::default())), - (ParquetCompression::Zstd, Some(lvl)) => { - ZstdLevel::try_new(lvl) - .map(Compression::ZSTD) - .map_err(|e| format!("Invalid ZSTD compression level: {}", e)) - } + (ParquetCompression::Zstd, Some(lvl)) => ZstdLevel::try_new(lvl) + .map(Compression::ZSTD) + .map_err(|e| format!("Invalid ZSTD compression level: {}", e)), (ParquetCompression::Zstd, None) => Ok(Compression::ZSTD(ZstdLevel::default())), } } @@ -73,7 +67,9 @@ impl ParquetCompression { impl From for Compression { fn from(compression: ParquetCompression) -> Self { - compression.to_compression(None).expect("Default compression should always be valid") + compression + .to_compression(None) + .expect("Default compression should always be valid") } } @@ -150,7 +146,9 @@ pub struct ParquetSerializerConfig { /// /// Only applies when `infer_schema` is enabled. Ignored when using explicit schema. 
#[serde(default)] - #[configurable(metadata(docs::examples = "vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"))] + #[configurable(metadata( + docs::examples = "vec![\"_metadata\".to_string(), \"internal_id\".to_string()]" + ))] pub exclude_columns: Option>, /// Maximum number of columns to encode @@ -305,8 +303,8 @@ fn default_max_columns() -> usize { } fn schema_example() -> SchemaDefinition { - use std::collections::BTreeMap; use super::schema_definition::FieldDefinition; + use std::collections::BTreeMap; let mut fields = BTreeMap::new(); fields.insert( @@ -322,7 +320,7 @@ fn schema_example() -> SchemaDefinition { "name".to_string(), FieldDefinition { r#type: "utf8".to_string(), - bloom_filter: true, // Example: enable for high-cardinality string field + bloom_filter: true, // Example: enable for high-cardinality string field bloom_filter_num_distinct_values: Some(1_000_000), bloom_filter_false_positive_pct: Some(0.01), }, @@ -377,9 +375,13 @@ impl ParquetSerializerConfig { fn validate(&self) -> Result<(), String> { // Must specify exactly one schema method match (self.schema.is_some(), self.infer_schema) { - (true, true) => Err("Cannot use both 'schema' and 'infer_schema: true'. Choose one.".to_string()), - (false, false) => Err("Must specify either 'schema' or 'infer_schema: true'".to_string()), - _ => Ok(()) + (true, true) => { + Err("Cannot use both 'schema' and 'infer_schema: true'. Choose one.".to_string()) + } + (false, false) => { + Err("Must specify either 'schema' or 'infer_schema: true'".to_string()) + } + _ => Ok(()), } } @@ -398,9 +400,7 @@ impl ParquetSerializerConfig { #[derive(Clone, Debug)] enum SchemaMode { /// Use pre-defined explicit schema - Explicit { - schema: Arc, - }, + Explicit { schema: Arc }, /// Infer schema from each batch Inferred { exclude_columns: std::collections::BTreeSet, @@ -419,7 +419,8 @@ impl ParquetSerializer { /// Create a new ParquetSerializer with the given configuration pub fn new(config: ParquetSerializerConfig) -> Result { // Validate configuration - config.validate() + config + .validate() .map_err(|e| vector_common::Error::from(e))?; // Keep a copy of schema_def for later use with Bloom filters @@ -428,7 +429,8 @@ impl ParquetSerializer { // Determine schema mode let schema_mode = if config.infer_schema { SchemaMode::Inferred { - exclude_columns: config.exclude_columns + exclude_columns: config + .exclude_columns .unwrap_or_default() .into_iter() .collect(), @@ -460,7 +462,9 @@ impl ParquetSerializer { }; // Build writer properties - let compression = config.compression.to_compression(config.compression_level) + let compression = config + .compression + .to_compression(config.compression_level) .map_err(|e| vector_common::Error::from(e))?; tracing::debug!( @@ -480,8 +484,8 @@ impl ParquetSerializer { } // Only apply Bloom filters and sorting for explicit schema mode - if let (SchemaMode::Explicit { schema }, Some(schema_def)) = (&schema_mode, &schema_def_opt) { - + if let (SchemaMode::Explicit { schema }, Some(schema_def)) = (&schema_mode, &schema_def_opt) + { // Apply per-column Bloom filter settings from schema let bloom_filter_configs = schema_def.extract_bloom_filter_configs(); for bloom_config in bloom_filter_configs { @@ -594,7 +598,9 @@ pub enum ParquetEncodingError { NoSchemaProvided, /// No fields could be inferred from events - #[snafu(display("No fields could be inferred from events (all fields excluded or only null values)"))] + #[snafu(display( + "No fields could be inferred from events (all fields excluded or 
only null values)" + ))] NoFieldsInferred, /// Invalid event type (not a log event) @@ -642,8 +648,8 @@ impl From for ParquetEncodingError { /// Infer Arrow DataType from a Vector Value fn infer_arrow_type(value: &vector_core::event::Value) -> arrow::datatypes::DataType { - use vector_core::event::Value; use arrow::datatypes::{DataType, TimeUnit}; + use vector_core::event::Value; match value { Value::Bytes(_) => DataType::Utf8, @@ -664,8 +670,8 @@ fn infer_schema_from_events( exclude_columns: &std::collections::BTreeSet, max_columns: usize, ) -> Result, ParquetEncodingError> { - use std::collections::BTreeMap; use arrow::datatypes::{DataType, Field}; + use std::collections::BTreeMap; use vector_core::event::Value; let mut field_types: BTreeMap = BTreeMap::new(); @@ -678,7 +684,9 @@ fn infer_schema_from_events( _ => return Err(ParquetEncodingError::InvalidEventType), }; - let fields_iter = log.all_event_fields().ok_or(ParquetEncodingError::InvalidEventType)?; + let fields_iter = log + .all_event_fields() + .ok_or(ParquetEncodingError::InvalidEventType)?; for (key, value) in fields_iter { let key_str = key.to_string(); @@ -765,11 +773,8 @@ pub fn encode_events_to_parquet( // Write RecordBatch to Parquet format in memory let mut buffer = Vec::new(); { - let mut writer = ArrowWriter::try_new( - &mut buffer, - batch_schema, - Some(writer_properties.clone()), - )?; + let mut writer = + ArrowWriter::try_new(&mut buffer, batch_schema, Some(writer_properties.clone()))?; writer.write(&record_batch)?; writer.close()?; @@ -968,7 +973,10 @@ mod tests { let props = WriterProperties::builder().build(); let result = encode_events_to_parquet(&events, schema, &props, None); assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), ParquetEncodingError::NoEvents)); + assert!(matches!( + result.unwrap_err(), + ParquetEncodingError::NoEvents + )); } #[test] @@ -1013,8 +1021,8 @@ mod tests { #[test] fn test_parquet_serializer_config() { - use std::collections::BTreeMap; use super::schema_definition::FieldDefinition; + use std::collections::BTreeMap; let mut fields = BTreeMap::new(); fields.insert( @@ -1065,9 +1073,9 @@ mod tests { #[test] fn test_encoder_trait_implementation() { + use super::schema_definition::FieldDefinition; use std::collections::BTreeMap; use tokio_util::codec::Encoder; - use super::schema_definition::FieldDefinition; let mut fields = BTreeMap::new(); fields.insert( @@ -1147,9 +1155,9 @@ mod tests { #[test] fn test_allow_nullable_fields_config() { + use super::schema_definition::FieldDefinition; use std::collections::BTreeMap; use tokio_util::codec::Encoder; - use super::schema_definition::FieldDefinition; let mut fields = BTreeMap::new(); fields.insert( diff --git a/lib/codecs/src/encoding/format/schema_definition.rs b/lib/codecs/src/encoding/format/schema_definition.rs index a1f66aa4d43fe..1ed2c05b1ff31 100644 --- a/lib/codecs/src/encoding/format/schema_definition.rs +++ b/lib/codecs/src/encoding/format/schema_definition.rs @@ -84,7 +84,9 @@ pub struct ColumnBloomFilterConfig { pub struct SchemaDefinition { /// Map of field names to their type and Bloom filter configuration #[serde(flatten)] - #[configurable(metadata(docs::additional_props_description = "A field definition specifying the data type and optional Bloom filter configuration."))] + #[configurable(metadata( + docs::additional_props_description = "A field definition specifying the data type and optional Bloom filter configuration." 
+ ))] pub fields: BTreeMap, } @@ -125,10 +127,7 @@ impl SchemaDefinition { } /// Parse a data type string into an Arrow DataType -fn parse_data_type( - type_str: &str, - field_name: &str, -) -> Result { +fn parse_data_type(type_str: &str, field_name: &str) -> Result { let data_type = match type_str.to_lowercase().as_str() { // String types "utf8" | "string" => DataType::Utf8, @@ -158,9 +157,7 @@ fn parse_data_type( "large_binary" => DataType::LargeBinary, // Timestamp types - "timestamp_second" | "timestamp_s" => { - DataType::Timestamp(TimeUnit::Second, None) - } + "timestamp_second" | "timestamp_s" => DataType::Timestamp(TimeUnit::Second, None), "timestamp_millisecond" | "timestamp_ms" | "timestamp_millis" => { DataType::Timestamp(TimeUnit::Millisecond, None) } @@ -196,7 +193,7 @@ fn parse_data_type( return Err(SchemaDefinitionError::UnknownDataType { field_name: field_name.to_string(), data_type: type_str.to_string(), - }) + }); } }; @@ -390,5 +387,4 @@ mod tests { assert_eq!(request_id_config.ndv, None); assert_eq!(request_id_config.fpp, None); } - } diff --git a/lib/codecs/src/encoding/mod.rs b/lib/codecs/src/encoding/mod.rs index 8d58e0b7ff98d..daa2a72134a2d 100644 --- a/lib/codecs/src/encoding/mod.rs +++ b/lib/codecs/src/encoding/mod.rs @@ -19,10 +19,12 @@ pub use format::{ ProtobufSerializerOptions, RawMessageSerializer, RawMessageSerializerConfig, TextSerializer, TextSerializerConfig, }; -#[cfg(feature = "parquet")] -pub use format::{ParquetCompression, ParquetEncodingError, ParquetSerializer, ParquetSerializerConfig}; #[cfg(feature = "opentelemetry")] pub use format::{OtlpSerializer, OtlpSerializerConfig}; +#[cfg(feature = "parquet")] +pub use format::{ + ParquetCompression, ParquetEncodingError, ParquetSerializer, ParquetSerializerConfig, +}; #[cfg(any(feature = "arrow", feature = "parquet"))] pub use format::{SchemaDefinition, SchemaDefinitionError}; pub use framing::{ diff --git a/lib/codecs/src/encoding/serializer.rs b/lib/codecs/src/encoding/serializer.rs index 4d1d50d186846..4bf0a7c682c06 100644 --- a/lib/codecs/src/encoding/serializer.rs +++ b/lib/codecs/src/encoding/serializer.rs @@ -8,10 +8,10 @@ use vector_core::{config::DataType, event::Event, schema}; #[cfg(feature = "arrow")] use super::format::ArrowStreamSerializerConfig; -#[cfg(feature = "opentelemetry")] -use super::format::{OtlpSerializer, OtlpSerializerConfig}; #[cfg(feature = "parquet")] use super::format::ParquetSerializerConfig; +#[cfg(feature = "opentelemetry")] +use super::format::{OtlpSerializer, OtlpSerializerConfig}; use super::{ chunking::Chunker, format::{ @@ -364,17 +364,17 @@ impl SerializerConfig { SerializerConfig::Gelf(config) => config.input_type(), SerializerConfig::Json(config) => config.input_type(), SerializerConfig::Logfmt => LogfmtSerializerConfig.input_type(), - SerializerConfig::Native => NativeSerializerConfig.input_type(), - SerializerConfig::NativeJson => NativeJsonSerializerConfig.input_type(), - #[cfg(feature = "opentelemetry")] - SerializerConfig::Otlp => OtlpSerializerConfig::default().input_type(), - SerializerConfig::Protobuf(config) => config.input_type(), - #[cfg(feature = "parquet")] - SerializerConfig::Parquet { parquet } => parquet.input_type(), - SerializerConfig::RawMessage => RawMessageSerializerConfig.input_type(), - SerializerConfig::Text(config) => config.input_type(), + SerializerConfig::Native => NativeSerializerConfig.input_type(), + SerializerConfig::NativeJson => NativeJsonSerializerConfig.input_type(), + #[cfg(feature = "opentelemetry")] + SerializerConfig::Otlp 
=> OtlpSerializerConfig::default().input_type(), + SerializerConfig::Protobuf(config) => config.input_type(), + #[cfg(feature = "parquet")] + SerializerConfig::Parquet { parquet } => parquet.input_type(), + SerializerConfig::RawMessage => RawMessageSerializerConfig.input_type(), + SerializerConfig::Text(config) => config.input_type(), + } } -} /// The schema required by the serializer. pub fn schema_requirement(&self) -> schema::Requirement { @@ -388,17 +388,17 @@ impl SerializerConfig { SerializerConfig::Json(config) => config.schema_requirement(), SerializerConfig::Logfmt => LogfmtSerializerConfig.schema_requirement(), SerializerConfig::Native => NativeSerializerConfig.schema_requirement(), - SerializerConfig::NativeJson => NativeJsonSerializerConfig.schema_requirement(), - #[cfg(feature = "opentelemetry")] - SerializerConfig::Otlp => OtlpSerializerConfig::default().schema_requirement(), - SerializerConfig::Protobuf(config) => config.schema_requirement(), - #[cfg(feature = "parquet")] - SerializerConfig::Parquet { parquet } => parquet.schema_requirement(), - SerializerConfig::RawMessage => RawMessageSerializerConfig.schema_requirement(), - SerializerConfig::Text(config) => config.schema_requirement(), + SerializerConfig::NativeJson => NativeJsonSerializerConfig.schema_requirement(), + #[cfg(feature = "opentelemetry")] + SerializerConfig::Otlp => OtlpSerializerConfig::default().schema_requirement(), + SerializerConfig::Protobuf(config) => config.schema_requirement(), + #[cfg(feature = "parquet")] + SerializerConfig::Parquet { parquet } => parquet.schema_requirement(), + SerializerConfig::RawMessage => RawMessageSerializerConfig.schema_requirement(), + SerializerConfig::Text(config) => config.schema_requirement(), + } } } -} /// Serialize structured events as bytes. #[derive(Debug, Clone)] @@ -456,11 +456,11 @@ impl Serializer { /// if you need to determine the capability to encode to JSON at runtime. 
pub fn to_json_value(&self, event: Event) -> Result { match self { - Serializer::Gelf(serializer) => serializer.to_json_value(event), - Serializer::Json(serializer) => serializer.to_json_value(event), - Serializer::NativeJson(serializer) => serializer.to_json_value(event), - Serializer::Avro(_) - | Serializer::Cef(_) + Serializer::Gelf(serializer) => serializer.to_json_value(event), + Serializer::Json(serializer) => serializer.to_json_value(event), + Serializer::NativeJson(serializer) => serializer.to_json_value(event), + Serializer::Avro(_) + | Serializer::Cef(_) | Serializer::Csv(_) | Serializer::Logfmt(_) | Serializer::Text(_) diff --git a/src/codecs/encoding/config.rs b/src/codecs/encoding/config.rs index c87d5c5d6f28c..7f76a768b8f6a 100644 --- a/src/codecs/encoding/config.rs +++ b/src/codecs/encoding/config.rs @@ -149,9 +149,8 @@ impl EncodingConfigWithFraming { #[cfg(feature = "codecs-parquet")] SerializerConfig::Parquet { parquet } => { let serializer = ParquetSerializer::new(parquet.clone())?; - let encoder = EncoderKind::Batch(BatchEncoder::new(BatchSerializer::Parquet( - serializer, - ))); + let encoder = + EncoderKind::Batch(BatchEncoder::new(BatchSerializer::Parquet(serializer))); Ok((self.transformer(), encoder)) } _ => { From b5d08591833469e0afa5c5ec9d1486703d386ed4 Mon Sep 17 00:00:00 2001 From: Rory Shanks <6383578+rorylshanks@users.noreply.github.com> Date: Tue, 23 Dec 2025 13:18:52 +0100 Subject: [PATCH 10/13] Update Cargo.toml Co-authored-by: Thomas --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6c6ee6e0391a4..32bb3e5a68807 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -496,7 +496,7 @@ ntapi = { git = "https://github.com/MSxDOS/ntapi.git", rev = "24fc1e47677fc9f6e3 [features] # Default features for *-unknown-linux-gnu and *-apple-darwin -default = ["api", "api-client", "codecs-parquet", "enrichment-tables", "sinks", "sources", "sources-dnstap", "transforms", "unix", "rdkafka?/gssapi-vendored", "secrets"] +default = ["api", "api-client", "enrichment-tables", "sinks", "sources", "sources-dnstap", "transforms", "unix", "rdkafka?/gssapi-vendored", "secrets"] # Default features for `cargo docs`. The same as `default` but without `rdkafka?/gssapi-vendored` which would require installing libsasl in our doc build environment. 
docs = ["api", "api-client", "enrichment-tables", "sinks", "sources", "sources-dnstap", "transforms", "unix", "secrets"] # Default features for *-unknown-linux-* which make use of `cmake` for dependencies From 861d9a4ca7b71b92ad4600523c407b4dffa2b8eb Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Wed, 24 Dec 2025 12:02:28 +0000 Subject: [PATCH 11/13] Fixed docs and check failures --- Cargo.toml | 2 +- lib/codecs/src/encoding/format/parquet.rs | 26 +- lib/codecs/src/encoding/serializer.rs | 9 +- scripts/generate-component-docs.rb | 37 ++ src/codecs/encoding/config.rs | 7 +- src/codecs/encoding/encoder.rs | 4 +- src/sinks/clickhouse/config.rs | 15 +- src/sinks/util/encoding.rs | 2 +- website/cue/reference.cue | 4 +- .../components/sinks/generated/amqp.cue | 327 ------------------ .../sinks/generated/aws_cloudwatch_logs.cue | 327 ------------------ .../sinks/generated/aws_kinesis_firehose.cue | 327 ------------------ .../sinks/generated/aws_kinesis_streams.cue | 327 ------------------ .../components/sinks/generated/aws_s3.cue | 30 +- .../components/sinks/generated/aws_sns.cue | 327 ------------------ .../components/sinks/generated/aws_sqs.cue | 327 ------------------ .../components/sinks/generated/azure_blob.cue | 30 +- .../components/sinks/generated/clickhouse.cue | 14 +- .../components/sinks/generated/console.cue | 327 ------------------ .../components/sinks/generated/file.cue | 327 ------------------ .../generated/gcp_chronicle_unstructured.cue | 327 ------------------ .../sinks/generated/gcp_cloud_storage.cue | 30 +- .../components/sinks/generated/gcp_pubsub.cue | 327 ------------------ .../components/sinks/generated/http.cue | 327 ------------------ .../components/sinks/generated/humio_logs.cue | 327 ------------------ .../components/sinks/generated/kafka.cue | 327 ------------------ .../components/sinks/generated/loki.cue | 327 ------------------ .../components/sinks/generated/mqtt.cue | 327 ------------------ .../components/sinks/generated/nats.cue | 327 ------------------ .../sinks/generated/opentelemetry.cue | 327 ------------------ .../components/sinks/generated/papertrail.cue | 327 ------------------ .../components/sinks/generated/pulsar.cue | 327 ------------------ .../components/sinks/generated/redis.cue | 327 ------------------ .../components/sinks/generated/socket.cue | 327 ------------------ .../sinks/generated/splunk_hec_logs.cue | 327 ------------------ .../components/sinks/generated/webhdfs.cue | 327 ------------------ .../components/sinks/generated/websocket.cue | 327 ------------------ .../sinks/generated/websocket_server.cue | 327 ------------------ 38 files changed, 123 insertions(+), 8262 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 32bb3e5a68807..bdd589c0bd94e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -845,7 +845,7 @@ sinks-aws_cloudwatch_logs = ["aws-core", "dep:aws-sdk-cloudwatchlogs", "dep:aws- sinks-aws_cloudwatch_metrics = ["aws-core", "dep:aws-sdk-cloudwatch"] sinks-aws_kinesis_firehose = ["aws-core", "dep:aws-sdk-firehose"] sinks-aws_kinesis_streams = ["aws-core", "dep:aws-sdk-kinesis"] -sinks-aws_s3 = ["dep:base64", "dep:md-5", "aws-core", "dep:aws-sdk-s3"] +sinks-aws_s3 = ["dep:base64", "dep:md-5", "aws-core", "dep:aws-sdk-s3", "codecs-parquet"] sinks-aws_sqs = ["aws-core", "dep:aws-sdk-sqs"] sinks-aws_sns = ["aws-core", "dep:aws-sdk-sns"] sinks-axiom = ["sinks-http"] diff --git a/lib/codecs/src/encoding/format/parquet.rs b/lib/codecs/src/encoding/format/parquet.rs index edea26c5dac19..b7688622a7275 100644 --- 
a/lib/codecs/src/encoding/format/parquet.rs +++ b/lib/codecs/src/encoding/format/parquet.rs @@ -44,7 +44,7 @@ pub enum ParquetCompression { impl ParquetCompression { /// Convert to parquet Compression with optional level override - fn to_compression(&self, level: Option) -> Result { + fn to_compression(self, level: Option) -> Result { match (self, level) { (ParquetCompression::Uncompressed, _) => Ok(Compression::UNCOMPRESSED), (ParquetCompression::Snappy, _) => Ok(Compression::SNAPPY), @@ -419,9 +419,7 @@ impl ParquetSerializer { /// Create a new ParquetSerializer with the given configuration pub fn new(config: ParquetSerializerConfig) -> Result { // Validate configuration - config - .validate() - .map_err(|e| vector_common::Error::from(e))?; + config.validate().map_err(vector_common::Error::from)?; // Keep a copy of schema_def for later use with Bloom filters let schema_def_opt = config.schema.clone(); @@ -465,7 +463,7 @@ impl ParquetSerializer { let compression = config .compression .to_compression(config.compression_level) - .map_err(|e| vector_common::Error::from(e))?; + .map_err(vector_common::Error::from)?; tracing::debug!( compression = ?config.compression, @@ -711,7 +709,7 @@ fn infer_schema_from_events( continue; } - let inferred_type = infer_arrow_type(&value); + let inferred_type = infer_arrow_type(value); match field_types.get(&key_str) { None => { @@ -827,7 +825,7 @@ mod tests { .set_compression(Compression::SNAPPY) .build(); - let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props, None); + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); assert!(result.is_ok()); let bytes = result.unwrap(); @@ -931,7 +929,7 @@ mod tests { let props = WriterProperties::builder().build(); - let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props, None); + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); assert!(result.is_ok()); let bytes = result.unwrap(); @@ -971,7 +969,7 @@ mod tests { true, )])); let props = WriterProperties::builder().build(); - let result = encode_events_to_parquet(&events, schema, &props, None); + let result = encode_events_to_parquet(&events, schema, &props); assert!(result.is_err()); assert!(matches!( result.unwrap_err(), @@ -1004,7 +1002,7 @@ mod tests { .set_compression(compression.into()) .build(); - let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props, None); + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); assert!(result.is_ok(), "Failed with compression: {:?}", compression); // Verify we can read it back @@ -1021,7 +1019,7 @@ mod tests { #[test] fn test_parquet_serializer_config() { - use super::schema_definition::FieldDefinition; + use super::super::schema_definition::FieldDefinition; use std::collections::BTreeMap; let mut fields = BTreeMap::new(); @@ -1073,7 +1071,7 @@ mod tests { #[test] fn test_encoder_trait_implementation() { - use super::schema_definition::FieldDefinition; + use super::super::schema_definition::FieldDefinition; use std::collections::BTreeMap; use tokio_util::codec::Encoder; @@ -1139,7 +1137,7 @@ mod tests { .set_max_row_group_size(5000) // 2 row groups .build(); - let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props, None); + let result = encode_events_to_parquet(&events, Arc::clone(&schema), &props); assert!(result.is_ok()); let bytes = result.unwrap(); @@ -1155,7 +1153,7 @@ mod tests { #[test] fn test_allow_nullable_fields_config() { - use 
super::schema_definition::FieldDefinition; + use super::super::schema_definition::FieldDefinition; use std::collections::BTreeMap; use tokio_util::codec::Encoder; diff --git a/lib/codecs/src/encoding/serializer.rs b/lib/codecs/src/encoding/serializer.rs index 4bf0a7c682c06..b40f6a3d742bb 100644 --- a/lib/codecs/src/encoding/serializer.rs +++ b/lib/codecs/src/encoding/serializer.rs @@ -307,12 +307,9 @@ impl SerializerConfig { } SerializerConfig::Text(config) => Ok(Serializer::Text(config.build())), #[cfg(feature = "parquet")] - SerializerConfig::Parquet { .. } => Err( - VectorError::from( - "Parquet codec is available only for batch encoding and cannot be built as a framed serializer.", - ) - .into(), - ), + SerializerConfig::Parquet { .. } => Err(VectorError::from( + "Parquet codec is available only for batch encoding and cannot be built as a framed serializer.", + )), } } diff --git a/scripts/generate-component-docs.rb b/scripts/generate-component-docs.rb index 5506212d1b30c..ed2b90532d54c 100755 --- a/scripts/generate-component-docs.rb +++ b/scripts/generate-component-docs.rb @@ -1666,6 +1666,39 @@ def unwrap_resolved_schema(root_schema, schema_name, friendly_name) return sort_hash_nested(unwrapped_resolved_schema) end +PARQUET_ALLOWED_SINKS = %w[aws_s3 gcp_cloud_storage azure_blob].freeze + +def remove_parquet_from_codec_config!(schema, field_name) + field = schema[field_name] + return if field.nil? + + field_options = field.dig('type', 'object', 'options') + return if field_options.nil? + + field_options.delete('parquet') + + codec = field_options['codec'] + codec_enum = codec.dig('type', 'string', 'enum') if codec.is_a?(Hash) + if codec_enum.is_a?(Hash) + codec_enum.delete('parquet') + elsif codec_enum.is_a?(Array) + codec_enum.delete('parquet') + end +end + +def prune_parquet_from_schema!(schema) + return unless schema.is_a?(Hash) + + schema.each do |field_name, field_def| + if %w[encoding batch_encoding].include?(field_name) + remove_parquet_from_codec_config!(schema, field_name) + end + + options = field_def.dig('type', 'object', 'options') + prune_parquet_from_schema!(options) if options.is_a?(Hash) + end +end + def render_and_import_schema(unwrapped_resolved_schema, friendly_name, config_map_path, cue_relative_path) # Set up the appropriate structure for the value based on the configuration map path. 
It defines @@ -1714,6 +1747,10 @@ def render_and_import_generated_component_schema(root_schema, schema_name, compo def render_and_import_component_schema(root_schema, schema_name, component_type, component_name) friendly_name = "'#{component_name}' #{component_type} configuration" unwrapped_resolved_schema = unwrap_resolved_schema(root_schema, schema_name, friendly_name) + unwrapped_resolved_schema = deep_copy(unwrapped_resolved_schema) + if component_type == 'sink' && !PARQUET_ALLOWED_SINKS.include?(component_name) + prune_parquet_from_schema!(unwrapped_resolved_schema) + end render_and_import_schema( unwrapped_resolved_schema, friendly_name, diff --git a/src/codecs/encoding/config.rs b/src/codecs/encoding/config.rs index 7f76a768b8f6a..63a00f959b887 100644 --- a/src/codecs/encoding/config.rs +++ b/src/codecs/encoding/config.rs @@ -1,4 +1,4 @@ -#[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] +#[cfg(feature = "codecs-parquet")] use crate::codecs::{BatchEncoder, BatchSerializer}; use crate::codecs::{Encoder, EncoderKind, Transformer}; #[cfg(feature = "codecs-parquet")] @@ -149,8 +149,9 @@ impl EncodingConfigWithFraming { #[cfg(feature = "codecs-parquet")] SerializerConfig::Parquet { parquet } => { let serializer = ParquetSerializer::new(parquet.clone())?; - let encoder = - EncoderKind::Batch(BatchEncoder::new(BatchSerializer::Parquet(serializer))); + let encoder = EncoderKind::Batch(Box::new(BatchEncoder::new( + BatchSerializer::Parquet(Box::new(serializer)), + ))); Ok((self.transformer(), encoder)) } _ => { diff --git a/src/codecs/encoding/encoder.rs b/src/codecs/encoding/encoder.rs index a88a2275911b8..4875200836244 100644 --- a/src/codecs/encoding/encoder.rs +++ b/src/codecs/encoding/encoder.rs @@ -22,7 +22,7 @@ pub enum BatchSerializer { Arrow(ArrowStreamSerializer), /// Parquet columnar format serializer. #[cfg(feature = "codecs-parquet")] - Parquet(ParquetSerializer), + Parquet(Box), } /// An encoder that encodes batches of events. @@ -103,7 +103,7 @@ pub enum EncoderKind { Framed(Box>), /// Encodes events in batches without framing #[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] - Batch(BatchEncoder), + Batch(Box), } #[derive(Debug, Clone)] diff --git a/src/sinks/clickhouse/config.rs b/src/sinks/clickhouse/config.rs index 6ff081e9aafa0..5421d4a30d100 100644 --- a/src/sinks/clickhouse/config.rs +++ b/src/sinks/clickhouse/config.rs @@ -5,7 +5,9 @@ use std::fmt; use http::{Request, StatusCode, Uri}; use hyper::Body; use vector_lib::codecs::encoding::format::SchemaProvider; -use vector_lib::codecs::encoding::{ArrowStreamSerializerConfig, BatchSerializerConfig}; +use vector_lib::codecs::encoding::{ + ArrowStreamSerializer, ArrowStreamSerializerConfig, BatchSerializerConfig, +}; use super::{ request_builder::ClickhouseRequestBuilder, @@ -293,6 +295,12 @@ impl ClickhouseConfig { let mut arrow_config = match batch_encoding { BatchSerializerConfig::ArrowStream(config) => config.clone(), + #[cfg(feature = "codecs-parquet")] + BatchSerializerConfig::Parquet { .. 
} => { + return Err( + "'batch_encoding' does not support Parquet for the ClickHouse sink.".into(), + ); + } }; self.resolve_arrow_schema( @@ -304,10 +312,9 @@ impl ClickhouseConfig { ) .await?; - let resolved_batch_config = BatchSerializerConfig::ArrowStream(arrow_config); - let arrow_serializer = resolved_batch_config.build()?; + let arrow_serializer = ArrowStreamSerializer::new(arrow_config)?; let batch_serializer = BatchSerializer::Arrow(arrow_serializer); - let encoder = EncoderKind::Batch(BatchEncoder::new(batch_serializer)); + let encoder = EncoderKind::Batch(Box::new(BatchEncoder::new(batch_serializer))); return Ok((Format::ArrowStream, encoder)); } diff --git a/src/sinks/util/encoding.rs b/src/sinks/util/encoding.rs index 39162ed3065b6..15b37940409e0 100644 --- a/src/sinks/util/encoding.rs +++ b/src/sinks/util/encoding.rs @@ -152,7 +152,7 @@ impl Encoder> for (Transformer, crate::codecs::EncoderKind) { } #[cfg(any(feature = "codecs-arrow", feature = "codecs-parquet"))] crate::codecs::EncoderKind::Batch(encoder) => { - (self.0.clone(), encoder.clone()).encode_input(events, writer) + (self.0.clone(), *encoder.clone()).encode_input(events, writer) } } } diff --git a/website/cue/reference.cue b/website/cue/reference.cue index 05d22f27e1f1a..844da8cb118c0 100644 --- a/website/cue/reference.cue +++ b/website/cue/reference.cue @@ -56,7 +56,7 @@ _values: { // * `removed` - The component has been removed. #DevelopmentStatus: "beta" | "stable" | "deprecated" | "removed" -#EncodingCodec: "json" | "logfmt" | "text" | "csv" | "native" | "native_json" | "avro" | "gelf" +#EncodingCodec: "json" | "logfmt" | "text" | "csv" | "native" | "native_json" | "avro" | "gelf" | "parquet" #Endpoint: { description: string @@ -556,6 +556,8 @@ _values: { } #TypeObject: { + _args: required: bool + // `examples` clarify values through examples. This should be used // when examples cannot be derived from the `default` or `enum` // options. diff --git a/website/cue/reference/components/sinks/generated/amqp.cue b/website/cue/reference/components/sinks/generated/amqp.cue index 43b62c7ef3705..66fb1312c5695 100644 --- a/website/cue/reference/components/sinks/generated/amqp.cue +++ b/website/cue/reference/components/sinks/generated/amqp.cue @@ -213,15 +213,6 @@ generated: components: sinks: amqp: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -385,324 +376,6 @@ generated: components: sinks: amqp: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. 
- - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. 
Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. 
- - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_cloudwatch_logs.cue b/website/cue/reference/components/sinks/generated/aws_cloudwatch_logs.cue index 2e6ef6ad800c8..12686c9b27b65 100644 --- a/website/cue/reference/components/sinks/generated/aws_cloudwatch_logs.cue +++ b/website/cue/reference/components/sinks/generated/aws_cloudwatch_logs.cue @@ -409,15 +409,6 @@ generated: components: sinks: aws_cloudwatch_logs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -581,324 +572,6 @@ generated: components: sinks: aws_cloudwatch_logs: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." 
- relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. 
Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. 
- - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." 
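For reference, a minimal sketch of how the options documented above might be wired into an `aws_s3` sink, assuming the `encoding.parquet` nesting these generated docs describe; the sink name, input name, bucket, and region are placeholders.

```yaml
# Minimal sketch (placeholder names): Parquet encoding on the aws_s3 sink with
# an inferred schema and the default Snappy compression.
sinks:
  s3_parquet_archive:            # placeholder sink name
    type: aws_s3
    inputs: ["app_logs"]         # placeholder source/transform name
    bucket: "example-archive"    # placeholder bucket
    region: "us-east-1"
    encoding:
      codec: parquet
      parquet:
        infer_schema: true       # derive column types from each batch
        compression: snappy      # default; gzip/zstd/lz4/brotli/uncompressed also documented
```

With `infer_schema: true`, each uploaded object derives its column types from the batch it encodes, so keeping event types consistent upstream matters.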
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_kinesis_firehose.cue b/website/cue/reference/components/sinks/generated/aws_kinesis_firehose.cue index cd0db2c964c77..da311f458462e 100644 --- a/website/cue/reference/components/sinks/generated/aws_kinesis_firehose.cue +++ b/website/cue/reference/components/sinks/generated/aws_kinesis_firehose.cue @@ -388,15 +388,6 @@ generated: components: sinks: aws_kinesis_firehose: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -560,324 +551,6 @@ generated: components: sinks: aws_kinesis_firehose: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. 
- """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. 
- """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. 
- """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_kinesis_streams.cue b/website/cue/reference/components/sinks/generated/aws_kinesis_streams.cue index dcf0475017bfe..4a800fa6e35da 100644 --- a/website/cue/reference/components/sinks/generated/aws_kinesis_streams.cue +++ b/website/cue/reference/components/sinks/generated/aws_kinesis_streams.cue @@ -388,15 +388,6 @@ generated: components: sinks: aws_kinesis_streams: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -560,324 +551,6 @@ generated: components: sinks: aws_kinesis_streams: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. 
- """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. 
Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. 
- - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." 
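To make the explicit-schema and Bloom-filter options concrete, here is a hedged sketch of a `schema` map with per-column settings plus `sorting_columns`, shown as the `encoding` section of an `aws_s3` sink; the column names are illustrative, and the `bloom_filter_false_positive_pct` / `bloom_filter_num_distinct_values` spellings follow the rename applied to the `aws_s3` generated docs below (the blocks being removed here still show the older `bloom_filter_fpp` / `bloom_filter_ndv` names).

```yaml
# Sketch only: explicit schema with a Bloom filter on a high-cardinality column,
# and rows pre-sorted by timestamp within each row group.
encoding:
  codec: parquet
  parquet:
    schema:
      timestamp:
        type: timestamp_microsecond
      user_id:                                  # illustrative high-cardinality column
        type: utf8
        bloom_filter: true
        bloom_filter_false_positive_pct: 0.01   # renamed field per the aws_s3 hunk below
        bloom_filter_num_distinct_values: 1000000
      message:
        type: utf8
    sorting_columns:
      - column: timestamp
        descending: true                        # most recent rows first
```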
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_s3.cue b/website/cue/reference/components/sinks/generated/aws_s3.cue index 56cdb39d51a79..fd9397f4f9da8 100644 --- a/website/cue/reference/components/sinks/generated/aws_s3.cue +++ b/website/cue/reference/components/sinks/generated/aws_s3.cue @@ -831,22 +831,22 @@ generated: components: sinks: aws_s3: configuration: { type: object: { examples: [{ id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" + bloom_filter: false + bloom_filter_false_positive_pct: null + bloom_filter_num_distinct_values: null + type: "int64" } name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" + bloom_filter: true + bloom_filter_false_positive_pct: 0.01 + bloom_filter_num_distinct_values: 1000000 + type: "utf8" } timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" + bloom_filter: false + bloom_filter_false_positive_pct: null + bloom_filter_num_distinct_values: null + type: "timestamp_microsecond" } }] options: "*": { @@ -867,9 +867,9 @@ generated: components: sinks: aws_s3: configuration: { examples: [true] } } - bloom_filter_fpp: { + bloom_filter_false_positive_pct: { description: """ - False positive probability for this column's Bloom filter + False positive probability for this column's Bloom filter (as a percentage) Lower values create larger but more accurate filters. @@ -879,7 +879,7 @@ generated: components: sinks: aws_s3: configuration: { required: false type: float: examples: [0.05, 0.01] } - bloom_filter_ndv: { + bloom_filter_num_distinct_values: { description: """ Number of distinct values expected for this column's Bloom filter diff --git a/website/cue/reference/components/sinks/generated/aws_sns.cue b/website/cue/reference/components/sinks/generated/aws_sns.cue index c17532a348f9f..1d2413066b071 100644 --- a/website/cue/reference/components/sinks/generated/aws_sns.cue +++ b/website/cue/reference/components/sinks/generated/aws_sns.cue @@ -319,15 +319,6 @@ generated: components: sinks: aws_sns: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -491,324 +482,6 @@ generated: components: sinks: aws_sns: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. 
- """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. 
Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. 
- - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/aws_sqs.cue b/website/cue/reference/components/sinks/generated/aws_sqs.cue index 0b047cae1a45b..912e1f2c43ea6 100644 --- a/website/cue/reference/components/sinks/generated/aws_sqs.cue +++ b/website/cue/reference/components/sinks/generated/aws_sqs.cue @@ -319,15 +319,6 @@ generated: components: sinks: aws_sqs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -491,324 +482,6 @@ generated: components: sinks: aws_sqs: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." 
- relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. 
Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. 
- - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." 
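The `row_group_size`, `compression_level`, and `writer_version` options described above interact with the sink's batching; the following sketch (placeholder sink name and values) keeps the row group no larger than `batch.max_events`, as these docs recommend, and opts into the v2 writer with a higher ZSTD level for cold data.

```yaml
# Sketch: align row groups with the batch size and use the v2 writer
# with a higher ZSTD level for cold storage.
sinks:
  s3_cold_storage:               # placeholder sink name
    type: aws_s3
    inputs: ["app_logs"]         # placeholder input
    bucket: "example-cold"       # placeholder bucket
    batch:
      max_events: 100000
    encoding:
      codec: parquet
      parquet:
        infer_schema: true
        compression: zstd
        compression_level: 12    # slower encode, smaller files (cold data)
        row_group_size: 100000   # <= batch.max_events; row groups cannot span files
        writer_version: v2       # documented default; use v1 for older readers
```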
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/azure_blob.cue b/website/cue/reference/components/sinks/generated/azure_blob.cue index fc99f7b11486c..9dc726f84f2c7 100644 --- a/website/cue/reference/components/sinks/generated/azure_blob.cue +++ b/website/cue/reference/components/sinks/generated/azure_blob.cue @@ -677,22 +677,22 @@ generated: components: sinks: azure_blob: configuration: { type: object: { examples: [{ id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" + bloom_filter: false + bloom_filter_false_positive_pct: null + bloom_filter_num_distinct_values: null + type: "int64" } name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" + bloom_filter: true + bloom_filter_false_positive_pct: 0.01 + bloom_filter_num_distinct_values: 1000000 + type: "utf8" } timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" + bloom_filter: false + bloom_filter_false_positive_pct: null + bloom_filter_num_distinct_values: null + type: "timestamp_microsecond" } }] options: "*": { @@ -713,9 +713,9 @@ generated: components: sinks: azure_blob: configuration: { examples: [true] } } - bloom_filter_fpp: { + bloom_filter_false_positive_pct: { description: """ - False positive probability for this column's Bloom filter + False positive probability for this column's Bloom filter (as a percentage) Lower values create larger but more accurate filters. @@ -725,7 +725,7 @@ generated: components: sinks: azure_blob: configuration: { required: false type: float: examples: [0.05, 0.01] } - bloom_filter_ndv: { + bloom_filter_num_distinct_values: { description: """ Number of distinct values expected for this column's Bloom filter diff --git a/website/cue/reference/components/sinks/generated/clickhouse.cue b/website/cue/reference/components/sinks/generated/clickhouse.cue index a0f4399bdc112..7141f9652815f 100644 --- a/website/cue/reference/components/sinks/generated/clickhouse.cue +++ b/website/cue/reference/components/sinks/generated/clickhouse.cue @@ -263,19 +263,13 @@ generated: components: sinks: clickhouse: configuration: { When disabled (default), missing values for non-nullable fields will cause encoding errors, ensuring all required data is present before sending to the sink. """ - required: false + relevant_when: "codec = \"arrow_stream\"" + required: false type: bool: default: false } codec: { - description: """ - Encodes events in [Apache Arrow][apache_arrow] IPC streaming format. - - This is the streaming variant of the Arrow IPC format, which writes - a continuous stream of record batches. - - [apache_arrow]: https://arrow.apache.org/ - """ - required: true + description: "The codec to use for batch encoding events." + required: true type: string: enum: arrow_stream: """ Encodes events in [Apache Arrow][apache_arrow] IPC streaming format. diff --git a/website/cue/reference/components/sinks/generated/console.cue b/website/cue/reference/components/sinks/generated/console.cue index 795ea8adeb382..16545964f6950 100644 --- a/website/cue/reference/components/sinks/generated/console.cue +++ b/website/cue/reference/components/sinks/generated/console.cue @@ -197,15 +197,6 @@ generated: components: sinks: console: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. 
- - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -369,324 +360,6 @@ generated: components: sinks: console: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. 
- - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. 
- - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/file.cue b/website/cue/reference/components/sinks/generated/file.cue index 731adbd5303f1..3135f4b70db5f 100644 --- a/website/cue/reference/components/sinks/generated/file.cue +++ b/website/cue/reference/components/sinks/generated/file.cue @@ -217,15 +217,6 @@ generated: components: sinks: file: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -389,324 +380,6 @@ generated: components: sinks: file: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. 
- Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." 
- required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. 
- - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/gcp_chronicle_unstructured.cue b/website/cue/reference/components/sinks/generated/gcp_chronicle_unstructured.cue index bd17da68ca422..0a027dfa45a74 100644 --- a/website/cue/reference/components/sinks/generated/gcp_chronicle_unstructured.cue +++ b/website/cue/reference/components/sinks/generated/gcp_chronicle_unstructured.cue @@ -285,15 +285,6 @@ generated: components: sinks: gcp_chronicle_unstructured: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -457,324 +448,6 @@ generated: components: sinks: gcp_chronicle_unstructured: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. 
- - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. 
- - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. 
- """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/gcp_cloud_storage.cue b/website/cue/reference/components/sinks/generated/gcp_cloud_storage.cue index 785e1879f463d..59f01853124a1 100644 --- a/website/cue/reference/components/sinks/generated/gcp_cloud_storage.cue +++ b/website/cue/reference/components/sinks/generated/gcp_cloud_storage.cue @@ -692,22 +692,22 @@ generated: components: sinks: gcp_cloud_storage: configuration: { type: object: { examples: [{ id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" + bloom_filter: false + bloom_filter_false_positive_pct: null + bloom_filter_num_distinct_values: null + type: "int64" } name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" + bloom_filter: true + bloom_filter_false_positive_pct: 0.01 + bloom_filter_num_distinct_values: 1000000 + type: "utf8" } timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" + bloom_filter: false + bloom_filter_false_positive_pct: null + bloom_filter_num_distinct_values: null + type: "timestamp_microsecond" } }] options: "*": { @@ -728,9 +728,9 @@ generated: components: sinks: gcp_cloud_storage: configuration: { examples: [true] } } - bloom_filter_fpp: { + bloom_filter_false_positive_pct: { description: """ - False positive probability for this column's Bloom filter + False positive probability for this column's Bloom filter (as a percentage) Lower values create larger but more accurate filters. 
@@ -740,7 +740,7 @@ generated: components: sinks: gcp_cloud_storage: configuration: { required: false type: float: examples: [0.05, 0.01] } - bloom_filter_ndv: { + bloom_filter_num_distinct_values: { description: """ Number of distinct values expected for this column's Bloom filter diff --git a/website/cue/reference/components/sinks/generated/gcp_pubsub.cue b/website/cue/reference/components/sinks/generated/gcp_pubsub.cue index e0aa0fd3faf6e..385bf9dd13793 100644 --- a/website/cue/reference/components/sinks/generated/gcp_pubsub.cue +++ b/website/cue/reference/components/sinks/generated/gcp_pubsub.cue @@ -264,15 +264,6 @@ generated: components: sinks: gcp_pubsub: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -436,324 +427,6 @@ generated: components: sinks: gcp_pubsub: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. 
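As a concrete reading of the compression-level guidance above, a hypothetical cold-storage configuration could look like the sketch below. The values follow the documented ZSTD 10-15 "better compression, slower encoding" band; the placement under `encoding.parquet` is assumed rather than taken from a tested config.

```yaml
# Hypothetical cold-storage oriented settings; level 12 sits in the
# documented ZSTD 10-15 band. Placement under `encoding.parquet` is assumed.
encoding:
  codec: parquet
  parquet:
    compression: zstd
    compression_level: 12
```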
- """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. 
- """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. 
- """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/http.cue b/website/cue/reference/components/sinks/generated/http.cue index 3c41e7ebad7b3..cc4da5bf49425 100644 --- a/website/cue/reference/components/sinks/generated/http.cue +++ b/website/cue/reference/components/sinks/generated/http.cue @@ -447,15 +447,6 @@ generated: components: sinks: http: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -619,324 +610,6 @@ generated: components: sinks: http: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. 
- """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. 
Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. 
- - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/humio_logs.cue b/website/cue/reference/components/sinks/generated/humio_logs.cue index 9f8dc77ad41cc..b9e46513d8b9c 100644 --- a/website/cue/reference/components/sinks/generated/humio_logs.cue +++ b/website/cue/reference/components/sinks/generated/humio_logs.cue @@ -263,15 +263,6 @@ generated: components: sinks: humio_logs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -435,324 +426,6 @@ generated: components: sinks: humio_logs: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." 
- relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. 
Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. 
- - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/kafka.cue b/website/cue/reference/components/sinks/generated/kafka.cue index 8ca8a02297a7c..9b3ef74b7dd76 100644 --- a/website/cue/reference/components/sinks/generated/kafka.cue +++ b/website/cue/reference/components/sinks/generated/kafka.cue @@ -252,15 +252,6 @@ generated: components: sinks: kafka: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -424,324 +415,6 @@ generated: components: sinks: kafka: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. 
- Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." 
- required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. 
- - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/loki.cue b/website/cue/reference/components/sinks/generated/loki.cue index d9c3674435a8f..933394342d1bd 100644 --- a/website/cue/reference/components/sinks/generated/loki.cue +++ b/website/cue/reference/components/sinks/generated/loki.cue @@ -449,15 +449,6 @@ generated: components: sinks: loki: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -621,324 +612,6 @@ generated: components: sinks: loki: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. 
- - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. 
- - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. 
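The "Common patterns" list above names a multi-tenant layout, but the embedded example only shows the time-series case. A minimal sketch of the multi-tenant variant follows, assuming the options nest under `encoding.parquet` as elsewhere in this reference; `tenant_id` and `timestamp` are placeholder field names.

```yaml
encoding:
  codec: parquet
  parquet:
    infer_schema: true
    sorting_columns:
      - column: tenant_id    # low-cardinality key first, groups similar rows together
        descending: false
      - column: timestamp    # newest events first within each tenant
        descending: true
```

Sorting on the lower-cardinality key first tends to yield the compression and row-group-skipping benefits described above.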
- """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/mqtt.cue b/website/cue/reference/components/sinks/generated/mqtt.cue index 0ad10982b42ae..980996cff164d 100644 --- a/website/cue/reference/components/sinks/generated/mqtt.cue +++ b/website/cue/reference/components/sinks/generated/mqtt.cue @@ -207,15 +207,6 @@ generated: components: sinks: mqtt: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -379,324 +370,6 @@ generated: components: sinks: mqtt: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. 
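As a sketch of the compression and writer-version options described above, the snippet below switches from the Snappy default to ZSTD and pins the v1 writer for older readers; the `encoding.parquet` nesting and the surrounding sink definition are assumed rather than shown.

```yaml
encoding:
  codec: parquet
  parquet:
    infer_schema: true
    compression: zstd    # better ratio than the snappy default, still reasonably fast
    writer_version: v1   # only needed when downstream readers predate Parquet 2.0 support
```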
- """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. 
Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. 
- - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/nats.cue b/website/cue/reference/components/sinks/generated/nats.cue index 8a0899a1db6bf..0979492903e9c 100644 --- a/website/cue/reference/components/sinks/generated/nats.cue +++ b/website/cue/reference/components/sinks/generated/nats.cue @@ -297,15 +297,6 @@ generated: components: sinks: nats: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -469,324 +460,6 @@ generated: components: sinks: nats: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." 
- relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. 
Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. 
- - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/opentelemetry.cue b/website/cue/reference/components/sinks/generated/opentelemetry.cue index e70b8ebd8faee..897288dbdfe72 100644 --- a/website/cue/reference/components/sinks/generated/opentelemetry.cue +++ b/website/cue/reference/components/sinks/generated/opentelemetry.cue @@ -450,15 +450,6 @@ generated: components: sinks: opentelemetry: configuration: protocol: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -622,324 +613,6 @@ generated: components: sinks: opentelemetry: configuration: protocol: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. 
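Following the level guidance above, a minimal sketch of a cold-storage-oriented setting; the level is illustrative rather than a benchmark, and the `encoding.parquet` nesting is assumed as elsewhere in this reference.

```yaml
encoding:
  codec: parquet
  parquet:
    compression: zstd
    compression_level: 12   # cold-storage range per the guidance above; use 3-6 for hot data
```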
- """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. 
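A minimal explicit-schema sketch using a few of the types listed above, with a Bloom filter enabled only on the high-cardinality column as recommended; the field names, cardinality, and `encoding.parquet` nesting are assumptions.

```yaml
encoding:
  codec: parquet
  parquet:
    allow_nullable_fields: true     # tolerate missing fields, per the option above
    schema:
      user_id:
        type: utf8
        bloom_filter: true          # high-cardinality point-lookup column
        bloom_filter_fpp: 0.01
        bloom_filter_ndv: 10000000
      status_code:
        type: int64
      timestamp:
        type: timestamp_microsecond
```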
- """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. 
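The `row_group_size` entry above notes that a row group cannot span files and should not exceed the batch size. A sketch of keeping the two aligned, assuming the sink exposes the usual `batch.max_events` setting; the numbers are illustrative.

```yaml
batch:
  max_events: 500000
encoding:
  codec: parquet
  parquet:
    infer_schema: true
    row_group_size: 100000   # <= batch.max_events, so at most five row groups per file
```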
- """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/papertrail.cue b/website/cue/reference/components/sinks/generated/papertrail.cue index 0b1ed7cc43906..b69042f48b2ff 100644 --- a/website/cue/reference/components/sinks/generated/papertrail.cue +++ b/website/cue/reference/components/sinks/generated/papertrail.cue @@ -197,15 +197,6 @@ generated: components: sinks: papertrail: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -369,324 +360,6 @@ generated: components: sinks: papertrail: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. 
- """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. 
Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. 
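The schema-inference options documented above (`infer_schema`, `exclude_columns`, `max_columns`) might be combined as in the following sketch; the excluded column names are placeholders and the `encoding.parquet` nesting is assumed.

```yaml
# Illustrative sketch: infer the schema per batch instead of declaring one.
encoding:
  codec: parquet
  parquet:
    infer_schema: true
    exclude_columns:
      - "_metadata"      # placeholder internal fields to drop
      - "internal_id"
    max_columns: 500     # columns beyond this count are silently dropped
```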
- - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/pulsar.cue b/website/cue/reference/components/sinks/generated/pulsar.cue index bc71f124b89be..cc2cb6c90cb3a 100644 --- a/website/cue/reference/components/sinks/generated/pulsar.cue +++ b/website/cue/reference/components/sinks/generated/pulsar.cue @@ -331,15 +331,6 @@ generated: components: sinks: pulsar: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -503,324 +494,6 @@ generated: components: sinks: pulsar: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." 
- relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. 
Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. 
- - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." 
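Combining the compression and writer-version options documented in the block above, one plausible cold-storage tuning is sketched below; the specific ZSTD level is only a suggestion within the documented 1-22 range, and the `encoding.parquet` nesting is assumed.

```yaml
# Illustrative sketch: heavier compression for cold storage, modern writer format.
encoding:
  codec: parquet
  parquet:
    compression: "zstd"
    compression_level: 12   # documented ZSTD range is 1-22; higher = smaller but slower
    writer_version: "v2"    # use "v1" when older readers must consume the files
```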
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/redis.cue b/website/cue/reference/components/sinks/generated/redis.cue index a3d98de388e65..fdd5686420da8 100644 --- a/website/cue/reference/components/sinks/generated/redis.cue +++ b/website/cue/reference/components/sinks/generated/redis.cue @@ -256,15 +256,6 @@ generated: components: sinks: redis: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -428,324 +419,6 @@ generated: components: sinks: redis: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. 
- Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." 
- required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. 
- - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/socket.cue b/website/cue/reference/components/sinks/generated/socket.cue index 11dc80283e144..17d6e8cf7d4d0 100644 --- a/website/cue/reference/components/sinks/generated/socket.cue +++ b/website/cue/reference/components/sinks/generated/socket.cue @@ -209,15 +209,6 @@ generated: components: sinks: socket: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -381,324 +372,6 @@ generated: components: sinks: socket: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. 
- - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. 
- - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. 
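Because the documentation above notes that each batch becomes its own Parquet file and row groups cannot span files, a hedged sketch of keeping `row_group_size` aligned with the sink's batch size follows; the `aws_s3` values shown are placeholders.

```yaml
# Illustrative sketch: keep row_group_size <= batch.max_events, as documented above.
sinks:
  s3_parquet:
    type: aws_s3
    inputs: ["my_source"]          # placeholder input
    bucket: "my-analytics-bucket"  # placeholder bucket
    batch:
      max_events: 100000
    encoding:
      codec: parquet
      parquet:
        infer_schema: true
        row_group_size: 100000     # one row group per emitted file in this setup
```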
- """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/splunk_hec_logs.cue b/website/cue/reference/components/sinks/generated/splunk_hec_logs.cue index a799b603b8d87..ce87f34e9ff83 100644 --- a/website/cue/reference/components/sinks/generated/splunk_hec_logs.cue +++ b/website/cue/reference/components/sinks/generated/splunk_hec_logs.cue @@ -313,15 +313,6 @@ generated: components: sinks: splunk_hec_logs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -485,324 +476,6 @@ generated: components: sinks: splunk_hec_logs: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. 
- """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. 
Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. 
- - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/webhdfs.cue b/website/cue/reference/components/sinks/generated/webhdfs.cue index 56630d46f67e3..236bd9f491f33 100644 --- a/website/cue/reference/components/sinks/generated/webhdfs.cue +++ b/website/cue/reference/components/sinks/generated/webhdfs.cue @@ -263,15 +263,6 @@ generated: components: sinks: webhdfs: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -435,324 +426,6 @@ generated: components: sinks: webhdfs: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." 
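For the `allow_nullable_fields` switch documented just below, a hedged sketch of enabling it alongside an explicit schema; the field names are placeholders and the `encoding.parquet` nesting is assumed.

```yaml
# Illustrative sketch: write nulls for missing values instead of failing the encode.
encoding:
  codec: parquet
  parquet:
    allow_nullable_fields: true    # default is false: missing values cause encoding errors
    schema:
      request_id:
        type: "utf8"
      duration_ms:
        type: "int64"
```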
- relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. 
Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. 
- - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." 
relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/websocket.cue b/website/cue/reference/components/sinks/generated/websocket.cue index f0d1f4264fca9..146bba2dada78 100644 --- a/website/cue/reference/components/sinks/generated/websocket.cue +++ b/website/cue/reference/components/sinks/generated/websocket.cue @@ -376,15 +376,6 @@ generated: components: sinks: websocket: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -548,324 +539,6 @@ generated: components: sinks: websocket: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. 
- Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." 
- required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. - """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. 
- - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" diff --git a/website/cue/reference/components/sinks/generated/websocket_server.cue b/website/cue/reference/components/sinks/generated/websocket_server.cue index eac5ac077c802..f822ab29b8626 100644 --- a/website/cue/reference/components/sinks/generated/websocket_server.cue +++ b/website/cue/reference/components/sinks/generated/websocket_server.cue @@ -253,15 +253,6 @@ generated: components: sinks: websocket_server: configuration: { [otlp]: https://opentelemetry.io/docs/specs/otlp/ """ - parquet: """ - Encodes events in [Apache Parquet][apache_parquet] columnar format. - - Parquet is a columnar storage format optimized for analytics workloads. - It provides efficient compression and encoding schemes, making it ideal - for long-term storage and query performance. - - [apache_parquet]: https://parquet.apache.org/ - """ protobuf: """ Encodes an event as a [Protobuf][protobuf] message. @@ -425,324 +416,6 @@ generated: components: sinks: websocket_server: configuration: { required: false type: array: items: type: string: {} } - parquet: { - description: "Apache Parquet-specific encoder options." - relevant_when: "codec = \"parquet\"" - required: true - type: object: options: { - allow_nullable_fields: { - description: """ - Allow null values for non-nullable fields in the schema. - - When enabled, missing or incompatible values will be encoded as null even for fields - marked as non-nullable in the Arrow schema. This is useful when working with downstream - systems that can handle null values through defaults, computed columns, or other mechanisms. - - When disabled (default), missing values for non-nullable fields will cause encoding errors, - ensuring all required data is present before writing to Parquet. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - compression: { - description: """ - Compression algorithm to use for Parquet columns - - Compression is applied to all columns in the Parquet file. - Snappy provides a good balance of speed and compression ratio. - """ - required: false - type: string: { - default: "snappy" - enum: { - brotli: "Brotli compression" - gzip: "GZIP compression (slower, better compression ratio)" - lz4: "LZ4 compression (very fast, moderate compression)" - snappy: "Snappy compression (fast, moderate compression ratio)" - uncompressed: "No compression" - zstd: "ZSTD compression (good balance of speed and compression)" - } - examples: ["snappy", "gzip", "zstd"] - } - } - compression_level: { - description: """ - Compression level for algorithms that support it. - - Only applies to ZSTD, GZIP, and Brotli compression. 
Ignored for other algorithms. - - **ZSTD levels** (1-22): - - 1-3: Fastest, moderate compression (level 3 is default) - - 4-9: Good balance of speed and compression - - 10-15: Better compression, slower encoding - - 16-22: Maximum compression, slowest (good for cold storage) - - **GZIP levels** (1-9): - - 1-3: Faster, less compression - - 6: Default balance (recommended) - - 9: Maximum compression, slowest - - **Brotli levels** (0-11): - - 0-4: Faster encoding - - 1: Default (recommended) - - 5-11: Better compression, slower - - Higher levels typically produce 20-50% smaller files but take 2-5x longer to encode. - Recommended: Use level 3-6 for hot data, 10-15 for cold storage. - """ - required: false - type: int: examples: [3, 6, 10] - } - exclude_columns: { - description: """ - Column names to exclude from Parquet encoding - - These columns will be completely excluded from the Parquet file. - Useful for filtering out metadata, internal fields, or temporary data. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: array: items: type: string: examples: ["vec![\"_metadata\".to_string(), \"internal_id\".to_string()]"] - } - infer_schema: { - description: """ - Automatically infer schema from event data - - When enabled, the schema is inferred from each batch of events independently. - The schema is determined by examining the types of values in the events. - - **Type mapping:** - - String values → `utf8` - - Integer values → `int64` - - Float values → `float64` - - Boolean values → `boolean` - - Timestamp values → `timestamp_microsecond` - - Arrays/Objects → `utf8` (serialized as JSON) - - **Type conflicts:** If a field has different types across events in the same batch, - it will be encoded as `utf8` (string) and all values will be converted to strings. - - **Important:** Schema consistency across batches is the operator's responsibility. - Use VRL transforms to ensure consistent types if needed. Each batch may produce - a different schema if event structure varies. - - **Bloom filters:** Not supported with inferred schemas. Use explicit schema for Bloom filters. - - Mutually exclusive with `schema`. Must specify either `schema` or `infer_schema: true`. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - max_columns: { - description: """ - Maximum number of columns to encode - - Limits the number of columns in the Parquet file. Additional columns beyond - this limit will be silently dropped. Columns are selected in the order they - appear in the first event. - - Only applies when `infer_schema` is enabled. Ignored when using explicit schema. - """ - required: false - type: uint: { - default: 1000 - examples: [500, 1000] - } - } - row_group_size: { - description: """ - Number of rows per row group - - Row groups are Parquet's unit of parallelization. Larger row groups - can improve compression but increase memory usage during encoding. - - Since each batch becomes a separate Parquet file, this value - should be <= the batch max_events setting. Row groups cannot span multiple files. - If not specified, defaults to the batch size. - """ - required: false - type: uint: examples: [100000, 1000000] - } - schema: { - description: """ - The Arrow schema definition to use for encoding - - This schema defines the structure and types of the Parquet file columns. - Specified as a map of field names to data types. - - Mutually exclusive with `infer_schema`. 
Must specify either `schema` or `infer_schema: true`. - - Supported types: utf8, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - float32, float64, boolean, binary, timestamp_second, timestamp_millisecond, - timestamp_microsecond, timestamp_nanosecond, date32, date64, and more. - """ - required: false - type: object: { - examples: [{ - id: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "int64" - } - name: { - bloom_filter: true - bloom_filter_fpp: 0.01 - bloom_filter_ndv: 1000000 - type: "utf8" - } - timestamp: { - bloom_filter: false - bloom_filter_fpp: null - bloom_filter_ndv: null - type: "timestamp_microsecond" - } - }] - options: "*": { - description: "A field definition specifying the data type and optional Bloom filter configuration." - required: true - type: object: options: { - bloom_filter: { - description: """ - Enable Bloom filter for this specific column - - When enabled, a Bloom filter will be created for this column to improve - query performance for point lookups and IN clauses. Only enable for - high-cardinality columns (UUIDs, user IDs, etc.) to avoid overhead. - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - bloom_filter_fpp: { - description: """ - False positive probability for this column's Bloom filter - - Lower values create larger but more accurate filters. - - - 0.05 (5%): Good balance for general use - - 0.01 (1%): Better for high-selectivity queries - """ - required: false - type: float: examples: [0.05, 0.01] - } - bloom_filter_ndv: { - description: """ - Number of distinct values expected for this column's Bloom filter - - This controls the size of the Bloom filter. Should match the actual - cardinality of the column. Will be automatically capped to the batch size. - - - Low cardinality (countries, states): 1,000 - 100,000 - - Medium cardinality (cities, products): 100,000 - 1,000,000 - - High cardinality (UUIDs, user IDs): 10,000,000+ - """ - required: false - type: uint: examples: [1000000, 10000000] - } - type: { - description: "Data type for this field" - required: true - type: string: examples: ["utf8", "int64", "timestamp_ms"] - } - } - } - } - } - sorting_columns: { - description: """ - Sorting order for rows within row groups. - - Pre-sorting rows by specified columns before writing can significantly improve both - compression ratios and query performance. This is especially valuable for time-series - data and event logs. - - **Benefits:** - - **Better compression** (20-40% smaller files): Similar values are grouped together - - **Faster queries**: More effective min/max statistics enable better row group skipping - - **Improved caching**: Query engines can more efficiently cache sorted data - - **Common patterns:** - - Time-series: Sort by timestamp descending (most recent first) - - Multi-tenant: Sort by tenant_id, then timestamp - - User analytics: Sort by user_id, then event_time - - **Trade-offs:** - - Adds sorting overhead during encoding (typically 10-30% slower writes) - - Requires buffering entire batch in memory for sorting - - Most beneficial when queries frequently filter on sorted columns - - **Example:** - ```yaml - sorting_columns: - - column: timestamp - descending: true - - column: user_id - descending: false - ``` - - If not specified, rows are written in the order they appear in the batch. 
- """ - required: false - type: array: items: type: object: options: { - column: { - description: "Name of the column to sort by" - required: true - type: string: examples: ["timestamp", "user_id"] - } - descending: { - description: """ - Sort in descending order (true) or ascending order (false) - - - `true`: Descending (Z-A, 9-0, newest-oldest) - - `false`: Ascending (A-Z, 0-9, oldest-newest) - """ - required: false - type: bool: { - default: false - examples: [true] - } - } - } - } - writer_version: { - description: """ - Parquet format writer version. - - Controls which Parquet format version to write: - - **v1** (PARQUET_1_0): Original format, maximum compatibility (default) - - **v2** (PARQUET_2_0): Modern format with improved encoding and statistics - - Version 2 benefits: - - More efficient encoding for certain data types (10-20% smaller files) - - Better statistics for query optimization - - Improved data page format - - Required for some advanced features - - Use v1 for maximum compatibility with older readers (pre-2018 tools). - Use v2 for better performance with modern query engines (Athena, Spark, Presto). - """ - required: false - type: string: { - default: "v2" - enum: { - v1: "Parquet format version 1.0 (maximum compatibility)" - v2: "Parquet format version 2.0 (modern format with better encoding)" - } - examples: ["v1", "v2"] - } - } - } - } protobuf: { description: "Options for the Protobuf serializer." relevant_when: "codec = \"protobuf\"" From 63fb18d9842479a8a29abcc2b70334794c759ac8 Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Thu, 8 Jan 2026 10:38:39 +0100 Subject: [PATCH 12/13] updated LICENSE-3rdparty.csv --- LICENSE-3rdparty.csv | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index b507953d3272b..03f42b3b93069 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -396,6 +396,7 @@ inotify-sys,https://github.com/hannobraun/inotify-sys,ISC,Hanno Braun , Joshka" instant,https://github.com/sebcrozet/instant,BSD-3-Clause,sebcrozet +integer-encoding,https://github.com/dermesser/integer-encoding-rs,MIT,Lewin Bormann inventory,https://github.com/dtolnay/inventory,MIT OR Apache-2.0,David Tolnay io-lifetimes,https://github.com/sunfishcode/io-lifetimes,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Dan Gohman io-uring,https://github.com/tokio-rs/io-uring,MIT OR Apache-2.0,quininer @@ -550,6 +551,7 @@ parking_lot,https://github.com/Amanieu/parking_lot,Apache-2.0 OR MIT,Amanieu d'A parking_lot,https://github.com/Amanieu/parking_lot,MIT OR Apache-2.0,Amanieu d'Antras parking_lot_core,https://github.com/Amanieu/parking_lot,Apache-2.0 OR MIT,Amanieu d'Antras parking_lot_core,https://github.com/Amanieu/parking_lot,MIT OR Apache-2.0,Amanieu d'Antras +parquet,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow parse-size,https://github.com/kennytm/parse-size,MIT,kennytm passt,https://github.com/kevingimbel/passt,MIT OR Apache-2.0,Kevin Gimbel paste,https://github.com/dtolnay/paste,MIT OR Apache-2.0,David Tolnay @@ -688,6 +690,7 @@ secrecy,https://github.com/iqlusioninc/crates/tree/main/secrecy,Apache-2.0 OR MI security-framework,https://github.com/kornelski/rust-security-framework,MIT OR Apache-2.0,"Steven Fackler , Kornel " security-framework-sys,https://github.com/kornelski/rust-security-framework,MIT OR Apache-2.0,"Steven Fackler , Kornel " semver,https://github.com/dtolnay/semver,MIT OR Apache-2.0,David Tolnay +seq-macro,https://github.com/dtolnay/seq-macro,MIT OR Apache-2.0,David Tolnay 
serde,https://github.com/serde-rs/serde,MIT OR Apache-2.0,"Erick Tryzelaar , David Tolnay " serde-aux,https://github.com/iddm/serde-aux,MIT,Victor Polevoy serde-toml-merge,https://github.com/jdrouet/serde-toml-merge,MIT,Jeremie Drouet @@ -770,6 +773,7 @@ terminal_size,https://github.com/eminence/terminal-size,MIT OR Apache-2.0,Andrew thiserror,https://github.com/dtolnay/thiserror,MIT OR Apache-2.0,David Tolnay thiserror-impl,https://github.com/dtolnay/thiserror,MIT OR Apache-2.0,David Tolnay thread_local,https://github.com/Amanieu/thread_local-rs,MIT OR Apache-2.0,Amanieu d'Antras +thrift,https://github.com/apache/thrift/tree/master/lib/rs,Apache-2.0,Apache Thrift Developers tikv-jemalloc-sys,https://github.com/tikv/jemallocator,MIT OR Apache-2.0,"Alex Crichton , Gonzalo Brito Gadeschi , The TiKV Project Developers" tikv-jemallocator,https://github.com/tikv/jemallocator,MIT OR Apache-2.0,"Alex Crichton , Gonzalo Brito Gadeschi , Simon Sapin , Steven Fackler , The TiKV Project Developers" time,https://github.com/time-rs/time,MIT OR Apache-2.0,"Jacob Pratt , Time contributors" From 4d660af2936535c291f3e126cf3391002a1783eb Mon Sep 17 00:00:00 2001 From: Rory Shanks Date: Thu, 8 Jan 2026 10:42:51 +0100 Subject: [PATCH 13/13] Fixed markdown checker --- changelog.d/parquet_encoder_aws_s3.feature.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/changelog.d/parquet_encoder_aws_s3.feature.md b/changelog.d/parquet_encoder_aws_s3.feature.md index 6a2e921f0d595..e57c8d04f44da 100644 --- a/changelog.d/parquet_encoder_aws_s3.feature.md +++ b/changelog.d/parquet_encoder_aws_s3.feature.md @@ -7,11 +7,13 @@ and Presto. Users can now configure Parquet encoding with custom schemas defined as a simple map of field names to data types. Features include: + - Support for all common data types: strings (utf8), integers (int8-int64), unsigned integers, floats (float32, float64), timestamps (second/millisecond/microsecond/nanosecond precision), booleans, binary data, and decimals - Configurable compression algorithms: snappy (default), gzip, zstd, lz4, brotli, or uncompressed + Each batch of events becomes one Parquet file in S3, with batch size controlled by the standard `batch.max_events`, `batch.max_bytes`, and `batch.timeout_secs` settings.
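
For reference, a configuration using the options documented in the generated CUE above might look like the following sketch. This is an illustrative example only: the sink name, `inputs`, `bucket`, and `region` are placeholders, and the nesting of the codec-specific options under `encoding.parquet` is taken from the generated documentation rather than from a verified end-to-end configuration.

```yaml
sinks:
  s3_parquet:                    # placeholder sink id
    type: aws_s3
    inputs: ["parse_logs"]       # placeholder upstream component
    bucket: "my-analytics-bucket" # placeholder bucket
    region: "us-east-1"           # placeholder region
    batch:
      max_events: 100000         # each batch becomes one Parquet file
      timeout_secs: 300
    encoding:
      codec: parquet
      parquet:
        compression: zstd
        compression_level: 6
        writer_version: v2
        schema:
          timestamp:
            type: timestamp_microsecond
          user_id:
            type: utf8
            bloom_filter: true
            bloom_filter_fpp: 0.01
            bloom_filter_ndv: 1000000
          status:
            type: int64
        sorting_columns:
          - column: timestamp
            descending: true
```

Alternatively, `infer_schema: true` could be used in place of the explicit `schema` map when event structure is consistent across batches, at the cost of losing per-column Bloom filter support.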