diff --git a/changelog.d/24074_clickhouse_arrow_complex_types.enhancement.md b/changelog.d/24074_clickhouse_arrow_complex_types.enhancement.md
new file mode 100644
index 0000000000000..5a5d6c65ec212
--- /dev/null
+++ b/changelog.d/24074_clickhouse_arrow_complex_types.enhancement.md
@@ -0,0 +1,3 @@
+The `clickhouse` sink now supports complex data types (Array, Map, and Tuple) when using the `arrow_stream` format.
+
+authors: benjamin-awd
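Review note: a minimal end-to-end sketch of what the changelog describes, using the APIs that appear later in this diff (`encode_events_to_arrow_ipc_stream`, `LogEvent`, and `serde_json::json!` as in the unit tests). The schema and field names here are illustrative, not part of the change:

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Fields, Schema};
use serde_json::json;
use vector_core::event::{Event, LogEvent};

// An Array (List) column and a Map column, mirroring the ClickHouse
// types named in the changelog entry above.
let schema = Arc::new(Schema::new(vec![
    Field::new(
        "tags",
        DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
        true,
    ),
    Field::new(
        "attrs",
        DataType::Map(
            Arc::new(Field::new(
                "entries",
                DataType::Struct(Fields::from(vec![
                    Field::new("key", DataType::Utf8, false),
                    Field::new("value", DataType::Int64, true),
                ])),
                false,
            )),
            false,
        ),
        true,
    ),
]));

let mut log = LogEvent::default();
log.insert("tags", json!(["a", "b"]));
log.insert("attrs", json!({"retries": 3}));

let bytes = encode_events_to_arrow_ipc_stream(&[Event::Log(log)], Some(schema))
    .expect("complex types should encode");
```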
diff --git a/lib/codecs/src/encoding/format/arrow.rs b/lib/codecs/src/encoding/format/arrow.rs
deleted file mode 100644
index 3c2d3863f1fb2..0000000000000
--- a/lib/codecs/src/encoding/format/arrow.rs
+++ /dev/null
@@ -1,1671 +0,0 @@
-//! Arrow IPC streaming format codec for batched event encoding
-//!
-//! Provides Apache Arrow IPC stream format encoding with static schema support.
-//! This implements the streaming variant of the Arrow IPC protocol, which writes
-//! a continuous stream of record batches without a file footer.
-
-use arrow::{
-    array::{
-        ArrayRef, BinaryBuilder, BooleanBuilder, Decimal128Builder, Decimal256Builder,
-        Float32Builder, Float64Builder, Int8Builder, Int16Builder, Int32Builder, Int64Builder,
-        StringBuilder, TimestampMicrosecondBuilder, TimestampMillisecondBuilder,
-        TimestampNanosecondBuilder, TimestampSecondBuilder, UInt8Builder, UInt16Builder,
-        UInt32Builder, UInt64Builder,
-    },
-    datatypes::{DataType, Schema, TimeUnit, i256},
-    ipc::writer::StreamWriter,
-    record_batch::RecordBatch,
-};
-use async_trait::async_trait;
-use bytes::{BufMut, Bytes, BytesMut};
-use chrono::{DateTime, Utc};
-use rust_decimal::Decimal;
-use snafu::Snafu;
-use std::sync::Arc;
-use vector_config::configurable_component;
-
-use vector_core::event::{Event, Value};
-
-/// Provides Arrow schema for encoding.
-///
-/// Sinks can implement this trait to provide custom schema fetching logic.
-#[async_trait]
-pub trait SchemaProvider: Send + Sync + std::fmt::Debug {
-    /// Fetch the Arrow schema from the data store.
-    ///
-    /// This is called during sink configuration build phase to fetch
-    /// the schema once at startup, rather than at runtime.
-    async fn get_schema(&self) -> Result<Schema, ArrowEncodingError>;
-}
-
-/// Configuration for Arrow IPC stream serialization
-#[configurable_component]
-#[derive(Clone, Default)]
-pub struct ArrowStreamSerializerConfig {
-    /// The Arrow schema to use for encoding
-    #[serde(skip)]
-    #[configurable(derived)]
-    pub schema: Option<Schema>,
-
-    /// Allow null values for non-nullable fields in the schema.
-    ///
-    /// When enabled, missing or incompatible values will be encoded as null even for fields
-    /// marked as non-nullable in the Arrow schema. This is useful when working with downstream
-    /// systems that can handle null values through defaults, computed columns, or other mechanisms.
-    ///
-    /// When disabled (default), missing values for non-nullable fields will cause encoding errors,
-    /// ensuring all required data is present before sending to the sink.
-    #[serde(default)]
-    #[configurable(derived)]
-    pub allow_nullable_fields: bool,
-}
-
-impl std::fmt::Debug for ArrowStreamSerializerConfig {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("ArrowStreamSerializerConfig")
-            .field(
-                "schema",
-                &self
-                    .schema
-                    .as_ref()
-                    .map(|s| format!("{} fields", s.fields().len())),
-            )
-            .field("allow_nullable_fields", &self.allow_nullable_fields)
-            .finish()
-    }
-}
-
-impl ArrowStreamSerializerConfig {
-    /// Create a new ArrowStreamSerializerConfig with a schema
-    pub fn new(schema: arrow::datatypes::Schema) -> Self {
-        Self {
-            schema: Some(schema),
-            allow_nullable_fields: false,
-        }
-    }
-
-    /// The data type of events that are accepted by `ArrowStreamEncoder`.
-    pub fn input_type(&self) -> vector_core::config::DataType {
-        vector_core::config::DataType::Log
-    }
-
-    /// The schema required by the serializer.
-    pub fn schema_requirement(&self) -> vector_core::schema::Requirement {
-        vector_core::schema::Requirement::empty()
-    }
-}
-
-/// Arrow IPC stream batch serializer that holds the schema
-#[derive(Clone, Debug)]
-pub struct ArrowStreamSerializer {
-    schema: Arc<Schema>,
-}
-
-impl ArrowStreamSerializer {
-    /// Create a new ArrowStreamSerializer with the given configuration
-    pub fn new(config: ArrowStreamSerializerConfig) -> Result<Self, vector_common::Error> {
-        let schema = config
-            .schema
-            .ok_or_else(|| vector_common::Error::from("Arrow serializer requires a schema."))?;
-
-        // If allow_nullable_fields is enabled, transform the schema once here
-        // instead of on every batch encoding
-        let schema = if config.allow_nullable_fields {
-            Schema::new_with_metadata(
-                schema
-                    .fields()
-                    .iter()
-                    .map(|f| Arc::new(make_field_nullable(f)))
-                    .collect::<Vec<_>>(),
-                schema.metadata().clone(),
-            )
-        } else {
-            schema
-        };
-
-        Ok(Self {
-            schema: Arc::new(schema),
-        })
-    }
-}
-
-impl tokio_util::codec::Encoder<Vec<Event>> for ArrowStreamSerializer {
-    type Error = ArrowEncodingError;
-
-    fn encode(&mut self, events: Vec<Event>, buffer: &mut BytesMut) -> Result<(), Self::Error> {
-        if events.is_empty() {
-            return Err(ArrowEncodingError::NoEvents);
-        }
-
-        let bytes = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&self.schema)))?;
-
-        buffer.extend_from_slice(&bytes);
-        Ok(())
-    }
-}
-
-/// Errors that can occur during Arrow encoding
-#[derive(Debug, Snafu)]
-pub enum ArrowEncodingError {
-    /// Failed to create Arrow record batch
-    #[snafu(display("Failed to create Arrow record batch: {}", source))]
-    RecordBatchCreation {
-        /// The underlying Arrow error
-        source: arrow::error::ArrowError,
-    },
-
-    /// Failed to write Arrow IPC data
-    #[snafu(display("Failed to write Arrow IPC data: {}", source))]
-    IpcWrite {
-        /// The underlying Arrow error
-        source: arrow::error::ArrowError,
-    },
-
-    /// No events provided for encoding
-    #[snafu(display("No events provided for encoding"))]
-    NoEvents,
-
-    /// Schema must be provided before encoding
-    #[snafu(display("Schema must be provided before encoding"))]
-    NoSchemaProvided,
-
-    /// Failed to fetch schema from provider
-    #[snafu(display("Failed to fetch schema from provider: {}", message))]
-    SchemaFetchError {
-        /// Error message from the provider
-        message: String,
-    },
-
-    /// Unsupported Arrow data type for field
-    #[snafu(display(
-        "Unsupported Arrow data type for field '{}': {:?}",
-        field_name,
-        data_type
-    ))]
-    UnsupportedType {
-        /// The field name
-        field_name: String,
-        /// The unsupported data type
-        data_type: DataType,
-    },
-
-    /// Null value encountered for non-nullable field
#[snafu(display("Null value for non-nullable field '{}'", field_name))] - NullConstraint { - /// The field name - field_name: String, - }, - - /// IO error during encoding - #[snafu(display("IO error: {}", source))] - Io { - /// The underlying IO error - source: std::io::Error, - }, -} - -impl From for ArrowEncodingError { - fn from(error: std::io::Error) -> Self { - Self::Io { source: error } - } -} - -/// Encodes a batch of events into Arrow IPC streaming format -pub fn encode_events_to_arrow_ipc_stream( - events: &[Event], - schema: Option>, -) -> Result { - if events.is_empty() { - return Err(ArrowEncodingError::NoEvents); - } - - let schema_ref = schema.ok_or(ArrowEncodingError::NoSchemaProvided)?; - - let record_batch = build_record_batch(schema_ref, events)?; - - let ipc_err = |source| ArrowEncodingError::IpcWrite { source }; - - let mut buffer = BytesMut::new().writer(); - let mut writer = - StreamWriter::try_new(&mut buffer, record_batch.schema_ref()).map_err(ipc_err)?; - writer.write(&record_batch).map_err(ipc_err)?; - writer.finish().map_err(ipc_err)?; - - Ok(buffer.into_inner().freeze()) -} - -/// Recursively makes a Field and all its nested fields nullable -fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Field { - let new_data_type = match field.data_type() { - DataType::List(inner_field) => DataType::List(Arc::new(make_field_nullable(inner_field))), - DataType::Struct(fields) => { - DataType::Struct(fields.iter().map(|f| make_field_nullable(f)).collect()) - } - DataType::Map(inner_field, sorted) => { - DataType::Map(Arc::new(make_field_nullable(inner_field)), *sorted) - } - other => other.clone(), - }; - - field - .clone() - .with_data_type(new_data_type) - .with_nullable(true) -} - -/// Builds an Arrow RecordBatch from events -fn build_record_batch( - schema: Arc, - events: &[Event], -) -> Result { - let num_fields = schema.fields().len(); - let mut columns: Vec = Vec::with_capacity(num_fields); - - for field in schema.fields() { - let field_name = field.name(); - let nullable = field.is_nullable(); - let array: ArrayRef = match field.data_type() { - DataType::Timestamp(time_unit, _) => { - build_timestamp_array(events, field_name, *time_unit, nullable)? - } - DataType::Utf8 => build_string_array(events, field_name, nullable)?, - DataType::Int8 => build_int8_array(events, field_name, nullable)?, - DataType::Int16 => build_int16_array(events, field_name, nullable)?, - DataType::Int32 => build_int32_array(events, field_name, nullable)?, - DataType::Int64 => build_int64_array(events, field_name, nullable)?, - DataType::UInt8 => build_uint8_array(events, field_name, nullable)?, - DataType::UInt16 => build_uint16_array(events, field_name, nullable)?, - DataType::UInt32 => build_uint32_array(events, field_name, nullable)?, - DataType::UInt64 => build_uint64_array(events, field_name, nullable)?, - DataType::Float32 => build_float32_array(events, field_name, nullable)?, - DataType::Float64 => build_float64_array(events, field_name, nullable)?, - DataType::Boolean => build_boolean_array(events, field_name, nullable)?, - DataType::Binary => build_binary_array(events, field_name, nullable)?, - DataType::Decimal128(precision, scale) => { - build_decimal128_array(events, field_name, *precision, *scale, nullable)? - } - DataType::Decimal256(precision, scale) => { - build_decimal256_array(events, field_name, *precision, *scale, nullable)? 
-            }
-            other_type => {
-                return Err(ArrowEncodingError::UnsupportedType {
-                    field_name: field_name.into(),
-                    data_type: other_type.clone(),
-                });
-            }
-        };
-
-        columns.push(array);
-    }
-
-    RecordBatch::try_new(schema, columns)
-        .map_err(|source| ArrowEncodingError::RecordBatchCreation { source })
-}
-
-/// Macro to handle appending null or returning an error for non-nullable fields.
-macro_rules! handle_null_constraints {
-    ($builder:expr, $nullable:expr, $field_name:expr) => {{
-        if !$nullable {
-            return Err(ArrowEncodingError::NullConstraint {
-                field_name: $field_name.into(),
-            });
-        }
-        $builder.append_null();
-    }};
-}
-
-/// Macro to generate a `build_*_array` function for primitive types.
-macro_rules! define_build_primitive_array_fn {
-    (
-        $fn_name:ident,    // The function name (e.g., build_int8_array)
-        $builder_ty:ty,    // The builder type (e.g., Int8Builder)
-        // One or more match arms for valid Value types
-        $( $value_pat:pat $(if $guard:expr)? => $append_expr:expr ),+
-    ) => {
-        fn $fn_name(
-            events: &[Event],
-            field_name: &str,
-            nullable: bool,
-        ) -> Result<ArrayRef, ArrowEncodingError> {
-            let mut builder = <$builder_ty>::with_capacity(events.len());
-
-            for event in events {
-                if let Event::Log(log) = event {
-                    match log.get(field_name) {
-                        $(
-                            $value_pat $(if $guard)? => builder.append_value($append_expr),
-                        )+
-                        // All other patterns are treated as null/invalid
-                        _ => handle_null_constraints!(builder, nullable, field_name),
-                    }
-                }
-            }
-            Ok(Arc::new(builder.finish()))
-        }
-    };
-}
-
-fn extract_timestamp(value: &Value) -> Option<DateTime<Utc>> {
-    match value {
-        Value::Timestamp(ts) => Some(*ts),
-        Value::Bytes(bytes) => std::str::from_utf8(bytes)
-            .ok()
-            .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok())
-            .map(|dt| dt.with_timezone(&Utc)),
-        _ => None,
-    }
-}
-
-fn build_timestamp_array(
-    events: &[Event],
-    field_name: &str,
-    time_unit: TimeUnit,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    macro_rules! build_array {
-        ($builder:ty, $converter:expr) => {{
-            let mut builder = <$builder>::with_capacity(events.len());
-            for event in events {
-                if let Event::Log(log) = event {
-                    let value_to_append = log.get(field_name).and_then(|value| {
-                        // First, try to extract it as a native or string timestamp
-                        if let Some(ts) = extract_timestamp(value) {
-                            $converter(&ts)
-                        }
-                        // Else, fall back to a raw integer
-                        else if let Value::Integer(i) = value {
-                            Some(*i)
-                        }
-                        // Else, it's an unsupported type (e.g., Bool, Float)
-                        else {
-                            None
-                        }
-                    });
-
-                    if value_to_append.is_none() && !nullable {
-                        return Err(ArrowEncodingError::NullConstraint {
-                            field_name: field_name.into(),
-                        });
-                    }
-
-                    builder.append_option(value_to_append);
-                }
-            }
-            Ok(Arc::new(builder.finish()))
-        }};
-    }
-
-    match time_unit {
-        TimeUnit::Second => {
-            build_array!(TimestampSecondBuilder, |ts: &DateTime<Utc>| Some(
-                ts.timestamp()
-            ))
-        }
-        TimeUnit::Millisecond => {
-            build_array!(TimestampMillisecondBuilder, |ts: &DateTime<Utc>| Some(
-                ts.timestamp_millis()
-            ))
-        }
-        TimeUnit::Microsecond => {
-            build_array!(TimestampMicrosecondBuilder, |ts: &DateTime<Utc>| Some(
-                ts.timestamp_micros()
-            ))
-        }
-        TimeUnit::Nanosecond => {
-            build_array!(TimestampNanosecondBuilder, |ts: &DateTime<Utc>| ts
-                .timestamp_nanos_opt())
-        }
-    }
-}
-
-fn build_string_array(
-    events: &[Event],
-    field_name: &str,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    let mut builder = StringBuilder::with_capacity(events.len(), 0);
-
-    for event in events {
-        if let Event::Log(log) = event {
-            let mut appended = false;
-            if let Some(value) = log.get(field_name) {
-                match value {
-                    Value::Bytes(bytes) => {
-                        // Attempt direct UTF-8 conversion first, fallback to lossy
-                        match std::str::from_utf8(bytes) {
-                            Ok(s) => builder.append_value(s),
-                            Err(_) => builder.append_value(&String::from_utf8_lossy(bytes)),
-                        }
-                        appended = true;
-                    }
-                    Value::Object(obj) => {
-                        if let Ok(s) = serde_json::to_string(&obj) {
-                            builder.append_value(s);
-                            appended = true;
-                        }
-                    }
-                    Value::Array(arr) => {
-                        if let Ok(s) = serde_json::to_string(&arr) {
-                            builder.append_value(s);
-                            appended = true;
-                        }
-                    }
-                    _ => {
-                        builder.append_value(&value.to_string_lossy());
-                        appended = true;
-                    }
-                }
-            }
-
-            if !appended {
-                handle_null_constraints!(builder, nullable, field_name);
-            }
-        }
-    }
-
-    Ok(Arc::new(builder.finish()))
-}
-
-define_build_primitive_array_fn!(
-    build_int8_array,
-    Int8Builder,
-    Some(Value::Integer(i)) if *i >= i8::MIN as i64 && *i <= i8::MAX as i64 => *i as i8
-);
-
-define_build_primitive_array_fn!(
-    build_int16_array,
-    Int16Builder,
-    Some(Value::Integer(i)) if *i >= i16::MIN as i64 && *i <= i16::MAX as i64 => *i as i16
-);
-
-define_build_primitive_array_fn!(
-    build_int32_array,
-    Int32Builder,
-    Some(Value::Integer(i)) if *i >= i32::MIN as i64 && *i <= i32::MAX as i64 => *i as i32
-);
-
-define_build_primitive_array_fn!(
-    build_int64_array,
-    Int64Builder,
-    Some(Value::Integer(i)) => *i
-);
-
-define_build_primitive_array_fn!(
-    build_uint8_array,
-    UInt8Builder,
-    Some(Value::Integer(i)) if *i >= 0 && *i <= u8::MAX as i64 => *i as u8
-);
-
-define_build_primitive_array_fn!(
-    build_uint16_array,
-    UInt16Builder,
-    Some(Value::Integer(i)) if *i >= 0 && *i <= u16::MAX as i64 => *i as u16
-);
-
-define_build_primitive_array_fn!(
-    build_uint32_array,
-    UInt32Builder,
-    Some(Value::Integer(i)) if *i >= 0 && *i <= u32::MAX as i64 => *i as u32
-);
-
-define_build_primitive_array_fn!(
-    build_uint64_array,
-    UInt64Builder,
-    Some(Value::Integer(i)) if *i >= 0 => *i as u64
-);
-
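For readers skimming the macro: each `define_build_primitive_array_fn!` invocation above expands to a builder loop over the batch. Roughly, for `Int8Builder` (a sketch of the expansion, not literal compiler output):

```rust
fn build_int8_array(
    events: &[Event],
    field_name: &str,
    nullable: bool,
) -> Result<ArrayRef, ArrowEncodingError> {
    let mut builder = Int8Builder::with_capacity(events.len());

    for event in events {
        if let Event::Log(log) = event {
            match log.get(field_name) {
                // In-range integers are narrowed and appended
                Some(Value::Integer(i)) if *i >= i8::MIN as i64 && *i <= i8::MAX as i64 => {
                    builder.append_value(*i as i8)
                }
                // Missing, wrong type, or out of range: null if allowed, else error
                _ => {
                    if !nullable {
                        return Err(ArrowEncodingError::NullConstraint {
                            field_name: field_name.into(),
                        });
                    }
                    builder.append_null();
                }
            }
        }
    }
    Ok(Arc::new(builder.finish()))
}
```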
-define_build_primitive_array_fn!(
-    build_float32_array,
-    Float32Builder,
-    Some(Value::Float(f)) => f.into_inner() as f32,
-    Some(Value::Integer(i)) => *i as f32
-);
-
-define_build_primitive_array_fn!(
-    build_float64_array,
-    Float64Builder,
-    Some(Value::Float(f)) => f.into_inner(),
-    Some(Value::Integer(i)) => *i as f64
-);
-
-define_build_primitive_array_fn!(
-    build_boolean_array,
-    BooleanBuilder,
-    Some(Value::Boolean(b)) => *b
-);
-
-fn build_binary_array(
-    events: &[Event],
-    field_name: &str,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    let mut builder = BinaryBuilder::with_capacity(events.len(), 0);
-
-    for event in events {
-        if let Event::Log(log) = event {
-            match log.get(field_name) {
-                Some(Value::Bytes(bytes)) => builder.append_value(bytes),
-                _ => handle_null_constraints!(builder, nullable, field_name),
-            }
-        }
-    }
-
-    Ok(Arc::new(builder.finish()))
-}
-
-fn build_decimal128_array(
-    events: &[Event],
-    field_name: &str,
-    precision: u8,
-    scale: i8,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    let mut builder = Decimal128Builder::with_capacity(events.len())
-        .with_precision_and_scale(precision, scale)
-        .map_err(|_| ArrowEncodingError::UnsupportedType {
-            field_name: field_name.into(),
-            data_type: DataType::Decimal128(precision, scale),
-        })?;
-
-    let target_scale = scale.unsigned_abs() as u32;
-
-    for event in events {
-        if let Event::Log(log) = event {
-            let mut appended = false;
-            match log.get(field_name) {
-                Some(Value::Float(f)) => {
-                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
-                        decimal.rescale(target_scale);
-                        let mantissa = decimal.mantissa();
-                        builder.append_value(mantissa);
-                        appended = true;
-                    }
-                }
-                Some(Value::Integer(i)) => {
-                    let mut decimal = Decimal::from(*i);
-                    decimal.rescale(target_scale);
-                    let mantissa = decimal.mantissa();
-                    builder.append_value(mantissa);
-                    appended = true;
-                }
-                _ => {}
-            }
-
-            if !appended {
-                handle_null_constraints!(builder, nullable, field_name);
-            }
-        }
-    }
-
-    Ok(Arc::new(builder.finish()))
-}
-
-fn build_decimal256_array(
-    events: &[Event],
-    field_name: &str,
-    precision: u8,
-    scale: i8,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    let mut builder = Decimal256Builder::with_capacity(events.len())
-        .with_precision_and_scale(precision, scale)
-        .map_err(|_| ArrowEncodingError::UnsupportedType {
-            field_name: field_name.into(),
-            data_type: DataType::Decimal256(precision, scale),
-        })?;
-
-    let target_scale = scale.unsigned_abs() as u32;
-
-    for event in events {
-        if let Event::Log(log) = event {
-            let mut appended = false;
-            match log.get(field_name) {
-                Some(Value::Float(f)) => {
-                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
-                        decimal.rescale(target_scale);
-                        let mantissa = decimal.mantissa();
-                        // rust_decimal does not support i256 natively so we upcast here
-                        builder.append_value(i256::from_i128(mantissa));
-                        appended = true;
-                    }
-                }
-                Some(Value::Integer(i)) => {
-                    let mut decimal = Decimal::from(*i);
-                    decimal.rescale(target_scale);
-                    let mantissa = decimal.mantissa();
-                    builder.append_value(i256::from_i128(mantissa));
-                    appended = true;
-                }
-                _ => {}
-            }
-
-            if !appended {
-                handle_null_constraints!(builder, nullable, field_name);
-            }
-        }
-    }
-
-    Ok(Arc::new(builder.finish()))
-}
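The decimal builders lean on `rust_decimal`'s `rescale`/`mantissa` pair: a value is forced to the column's scale, and the resulting integer mantissa is what Arrow stores. A worked example of that arithmetic, matching the expectations in the decimal tests below:

```rust
use rust_decimal::Decimal;

// Decimal128(10, 2): values are stored as integer mantissas at scale 2.
let mut d = Decimal::try_from(99.99_f64).unwrap();
d.rescale(2); // force exactly two fractional digits
assert_eq!(d.mantissa(), 9999); // 99.99 == 9999 * 10^-2
```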
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use arrow::{
-        array::{
-            Array, BinaryArray, BooleanArray, Float32Array, Float64Array, Int8Array, Int16Array,
-            Int32Array, Int64Array, StringArray, TimestampMicrosecondArray,
-            TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray,
-        },
-        datatypes::Field,
-        ipc::reader::StreamReader,
-    };
-    use chrono::Utc;
-    use std::io::Cursor;
-    use vector_core::event::LogEvent;
-
-    #[test]
-    fn test_encode_all_types() {
-        let mut log = LogEvent::default();
-        log.insert("string_field", "test");
-        log.insert("int8_field", 127);
-        log.insert("int16_field", 32000);
-        log.insert("int32_field", 1000000);
-        log.insert("int64_field", 42);
-        log.insert("float32_field", 3.15);
-        log.insert("float64_field", 3.15);
-        log.insert("bool_field", true);
-        log.insert("bytes_field", bytes::Bytes::from("binary"));
-        log.insert("timestamp_field", Utc::now());
-
-        let events = vec![Event::Log(log)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("string_field", DataType::Utf8, true),
-            Field::new("int8_field", DataType::Int8, true),
-            Field::new("int16_field", DataType::Int16, true),
-            Field::new("int32_field", DataType::Int32, true),
-            Field::new("int64_field", DataType::Int64, true),
-            Field::new("float32_field", DataType::Float32, true),
-            Field::new("float64_field", DataType::Float64, true),
-            Field::new("bool_field", DataType::Boolean, true),
-            Field::new("bytes_field", DataType::Binary, true),
-            Field::new(
-                "timestamp_field",
-                DataType::Timestamp(TimeUnit::Millisecond, None),
-                true,
-            ),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-        assert_eq!(batch.num_columns(), 10);
-
-        // Verify string field
-        assert_eq!(
-            batch
-                .column(0)
-                .as_any()
-                .downcast_ref::<StringArray>()
-                .unwrap()
-                .value(0),
-            "test"
-        );
-
-        // Verify int8 field
-        assert_eq!(
-            batch
-                .column(1)
-                .as_any()
-                .downcast_ref::<Int8Array>()
-                .unwrap()
-                .value(0),
-            127
-        );
-
-        // Verify int16 field
-        assert_eq!(
-            batch
-                .column(2)
-                .as_any()
-                .downcast_ref::<Int16Array>()
-                .unwrap()
-                .value(0),
-            32000
-        );
-
-        // Verify int32 field
-        assert_eq!(
-            batch
-                .column(3)
-                .as_any()
-                .downcast_ref::<Int32Array>()
-                .unwrap()
-                .value(0),
-            1000000
-        );
-
-        // Verify int64 field
-        assert_eq!(
-            batch
-                .column(4)
-                .as_any()
-                .downcast_ref::<Int64Array>()
-                .unwrap()
-                .value(0),
-            42
-        );
-
-        // Verify float32 field
-        assert!(
-            (batch
-                .column(5)
-                .as_any()
-                .downcast_ref::<Float32Array>()
-                .unwrap()
-                .value(0)
-                - 3.15)
-                .abs()
-                < 0.001
-        );
-
-        // Verify float64 field
-        assert!(
-            (batch
-                .column(6)
-                .as_any()
-                .downcast_ref::<Float64Array>()
-                .unwrap()
-                .value(0)
-                - 3.15)
-                .abs()
-                < 0.001
-        );
-
-        // Verify boolean field
-        assert!(
-            batch
-                .column(7)
-                .as_any()
-                .downcast_ref::<BooleanArray>()
-                .unwrap()
-                .value(0)
-        );
-
-        // Verify binary field
-        assert_eq!(
-            batch
-                .column(8)
-                .as_any()
-                .downcast_ref::<BinaryArray>()
-                .unwrap()
-                .value(0),
-            b"binary"
-        );
-
-        // Verify timestamp field
-        assert!(
-            !batch
-                .column(9)
-                .as_any()
-                .downcast_ref::<TimestampMillisecondArray>()
-                .unwrap()
-                .is_null(0)
-        );
-    }
-
-    #[test]
-    fn test_encode_null_values() {
-        let mut log1 = LogEvent::default();
-        log1.insert("field_a", 1);
-        // field_b is missing
-
-        let mut log2 = LogEvent::default();
-        log2.insert("field_b", 2);
-        // field_a is missing
-
-        let events = vec![Event::Log(log1), Event::Log(log2)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("field_a", DataType::Int64, true),
-            Field::new("field_b", DataType::Int64, true),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 2);
-
-        let field_a = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-        assert_eq!(field_a.value(0), 1);
-        assert!(field_a.is_null(1));
-
-        let field_b = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-        assert!(field_b.is_null(0));
-        assert_eq!(field_b.value(1), 2);
-    }
-
-    #[test]
-    fn test_encode_type_mismatches() {
-        let mut log1 = LogEvent::default();
-        log1.insert("field", 42); // Integer
-
-        let mut log2 = LogEvent::default();
-        log2.insert("field", 3.15); // Float - type mismatch!
-
-        let events = vec![Event::Log(log1), Event::Log(log2)];
-
-        // Schema expects Int64
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "field",
-            DataType::Int64,
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 2);
-
-        let field_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-        assert_eq!(field_array.value(0), 42);
-        assert!(field_array.is_null(1)); // Type mismatch becomes null
-    }
-
-    #[test]
-    fn test_encode_complex_json_values() {
-        use serde_json::json;
-
-        let mut log = LogEvent::default();
-        log.insert(
-            "object_field",
-            json!({"key": "value", "nested": {"count": 42}}),
-        );
-        log.insert("array_field", json!([1, 2, 3]));
-
-        let events = vec![Event::Log(log)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("object_field", DataType::Utf8, true),
-            Field::new("array_field", DataType::Utf8, true),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-
-        let object_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .unwrap();
-        let object_str = object_array.value(0);
-        assert!(object_str.contains("key"));
-        assert!(object_str.contains("value"));
-
-        let array_array = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .unwrap();
-        let array_str = array_array.value(0);
-        assert_eq!(array_str, "[1,2,3]");
-    }
-
-    #[test]
-    fn test_encode_unsupported_type() {
-        let mut log = LogEvent::default();
-        log.insert("field", "value");
-
-        let events = vec![Event::Log(log)];
-
-        // Use an unsupported type
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "field",
-            DataType::Duration(TimeUnit::Millisecond),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(schema));
-        assert!(result.is_err());
-        assert!(matches!(
-            result.unwrap_err(),
-            ArrowEncodingError::UnsupportedType { .. }
} - )); - } - - #[test] - fn test_encode_without_schema_fails() { - let mut log1 = LogEvent::default(); - log1.insert("message", "hello"); - - let events = vec![Event::Log(log1)]; - - let result = encode_events_to_arrow_ipc_stream(&events, None); - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - ArrowEncodingError::NoSchemaProvided - )); - } - - #[test] - fn test_encode_empty_events() { - let events: Vec = vec![]; - let result = encode_events_to_arrow_ipc_stream(&events, None); - assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), ArrowEncodingError::NoEvents)); - } - - #[test] - fn test_encode_timestamp_precisions() { - let now = Utc::now(); - let mut log = LogEvent::default(); - log.insert("ts_second", now); - log.insert("ts_milli", now); - log.insert("ts_micro", now); - log.insert("ts_nano", now); - - let events = vec![Event::Log(log)]; - - let schema = Arc::new(Schema::new(vec![ - Field::new( - "ts_second", - DataType::Timestamp(TimeUnit::Second, None), - true, - ), - Field::new( - "ts_milli", - DataType::Timestamp(TimeUnit::Millisecond, None), - true, - ), - Field::new( - "ts_micro", - DataType::Timestamp(TimeUnit::Microsecond, None), - true, - ), - Field::new( - "ts_nano", - DataType::Timestamp(TimeUnit::Nanosecond, None), - true, - ), - ])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); - - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 4); - - let ts_second = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!ts_second.is_null(0)); - assert_eq!(ts_second.value(0), now.timestamp()); - - let ts_milli = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!ts_milli.is_null(0)); - assert_eq!(ts_milli.value(0), now.timestamp_millis()); - - let ts_micro = batch - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!ts_micro.is_null(0)); - assert_eq!(ts_micro.value(0), now.timestamp_micros()); - - let ts_nano = batch - .column(3) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!ts_nano.is_null(0)); - assert_eq!(ts_nano.value(0), now.timestamp_nanos_opt().unwrap()); - } - - #[test] - fn test_encode_mixed_timestamp_string_and_native() { - // Test mixing string timestamps with native Timestamp values - let mut log1 = LogEvent::default(); - log1.insert("ts", "2025-10-22T10:18:44.256Z"); // String - - let mut log2 = LogEvent::default(); - log2.insert("ts", Utc::now()); // Native Timestamp - - let mut log3 = LogEvent::default(); - log3.insert("ts", 1729594724256000000_i64); // Integer (nanoseconds) - - let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - - let schema = Arc::new(Schema::new(vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Nanosecond, None), - true, - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); - - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(batch.num_rows(), 3); - - let ts_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - // All three should be non-null - assert!(!ts_array.is_null(0)); - assert!(!ts_array.is_null(1)); - 
-        assert!(!ts_array.is_null(2));
-
-        // First one should match the parsed string
-        let expected = chrono::DateTime::parse_from_rfc3339("2025-10-22T10:18:44.256Z")
-            .unwrap()
-            .timestamp_nanos_opt()
-            .unwrap();
-        assert_eq!(ts_array.value(0), expected);
-
-        // Third one should match the integer
-        assert_eq!(ts_array.value(2), 1729594724256000000_i64);
-    }
-
-    #[test]
-    fn test_encode_invalid_string_timestamp() {
-        // Test that invalid timestamp strings become null
-        let mut log1 = LogEvent::default();
-        log1.insert("timestamp", "not-a-timestamp");
-
-        let mut log2 = LogEvent::default();
-        log2.insert("timestamp", "2025-10-22T10:18:44.256Z"); // Valid
-
-        let mut log3 = LogEvent::default();
-        log3.insert("timestamp", "2025-99-99T99:99:99Z"); // Invalid
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "timestamp",
-            DataType::Timestamp(TimeUnit::Nanosecond, None),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 3);
-
-        let ts_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<TimestampNanosecondArray>()
-            .unwrap();
-
-        // Invalid timestamps should be null
-        assert!(ts_array.is_null(0));
-        assert!(!ts_array.is_null(1)); // Valid one
-        assert!(ts_array.is_null(2));
-    }
-
-    #[test]
-    fn test_encode_decimal128_from_integer() {
-        use arrow::array::Decimal128Array;
-
-        let mut log = LogEvent::default();
-        // Store quantity as integer: 1000
-        log.insert("quantity", 1000_i64);
-
-        let events = vec![Event::Log(log)];
-
-        // Decimal(10, 3) - will represent 1000 as 1000.000
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "quantity",
-            DataType::Decimal128(10, 3),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-
-        let decimal_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Decimal128Array>()
-            .unwrap();
-
-        assert!(!decimal_array.is_null(0));
-        // 1000 with scale 3 = 1000 * 10^3 = 1000000
-        assert_eq!(decimal_array.value(0), 1000000_i128);
-    }
-
-    #[test]
-    fn test_encode_decimal256() {
-        use arrow::array::Decimal256Array;
-
-        let mut log = LogEvent::default();
-        // Very large precision number
-        log.insert("big_value", 123456789.123456_f64);
-
-        let events = vec![Event::Log(log)];
-
-        // Decimal256(50, 6) - high precision decimal
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "big_value",
-            DataType::Decimal256(50, 6),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-
-        let decimal_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Decimal256Array>()
-            .unwrap();
-
-        assert!(!decimal_array.is_null(0));
-        // Value should be non-null and encoded
-        let value = decimal_array.value(0);
-        assert!(value.to_i128().is_some());
-    }
-
-    #[test]
-    fn test_encode_decimal_null_values() {
-        use arrow::array::Decimal128Array;
-
-        let mut log1 = LogEvent::default();
-        log1.insert("price", 99.99_f64);
-
-        let log2 = LogEvent::default();
-        // No price field - should be null
-
-        let mut log3 = LogEvent::default();
-        log3.insert("price", 50.00_f64);
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "price",
-            DataType::Decimal128(10, 2),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 3);
-
-        let decimal_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Decimal128Array>()
-            .unwrap();
-
-        // First row: 99.99
-        assert!(!decimal_array.is_null(0));
-        assert_eq!(decimal_array.value(0), 9999_i128);
-
-        // Second row: null
-        assert!(decimal_array.is_null(1));
-
-        // Third row: 50.00
-        assert!(!decimal_array.is_null(2));
-        assert_eq!(decimal_array.value(2), 5000_i128);
-    }
-
-    #[test]
-    fn test_encode_unsigned_integer_types() {
-        use arrow::array::{UInt8Array, UInt16Array, UInt32Array, UInt64Array};
-
-        let mut log = LogEvent::default();
-        log.insert("uint8_field", 255_i64);
-        log.insert("uint16_field", 65535_i64);
-        log.insert("uint32_field", 4294967295_i64);
-        log.insert("uint64_field", 9223372036854775807_i64);
-
-        let events = vec![Event::Log(log)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("uint8_field", DataType::UInt8, true),
-            Field::new("uint16_field", DataType::UInt16, true),
-            Field::new("uint32_field", DataType::UInt32, true),
-            Field::new("uint64_field", DataType::UInt64, true),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-        assert_eq!(batch.num_columns(), 4);
-
-        // Verify uint8
-        let uint8_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<UInt8Array>()
-            .unwrap();
-        assert_eq!(uint8_array.value(0), 255_u8);
-
-        // Verify uint16
-        let uint16_array = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<UInt16Array>()
-            .unwrap();
-        assert_eq!(uint16_array.value(0), 65535_u16);
-
-        // Verify uint32
-        let uint32_array = batch
-            .column(2)
-            .as_any()
-            .downcast_ref::<UInt32Array>()
-            .unwrap();
-        assert_eq!(uint32_array.value(0), 4294967295_u32);
-
-        // Verify uint64
-        let uint64_array = batch
-            .column(3)
-            .as_any()
-            .downcast_ref::<UInt64Array>()
-            .unwrap();
-        assert_eq!(uint64_array.value(0), 9223372036854775807_u64);
-    }
-
-    #[test]
-    fn test_encode_unsigned_integers_with_null_and_overflow() {
-        use arrow::array::{UInt8Array, UInt32Array};
-
-        let mut log1 = LogEvent::default();
-        log1.insert("uint8_field", 100_i64);
-        log1.insert("uint32_field", 1000_i64);
-
-        let mut log2 = LogEvent::default();
-        log2.insert("uint8_field", 300_i64); // Overflow - should be null
-        log2.insert("uint32_field", -1_i64); // Negative - should be null
-
-        let log3 = LogEvent::default();
-        // Missing fields - should be null
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("uint8_field", DataType::UInt8, true),
-            Field::new("uint32_field", DataType::UInt32, true),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 3);
-
-        // Check uint8 column
-        let uint8_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<UInt8Array>()
-            .unwrap();
-        assert_eq!(uint8_array.value(0), 100_u8); // Valid
-        assert!(uint8_array.is_null(1)); // Overflow
-        assert!(uint8_array.is_null(2)); // Missing
-
-        // Check uint32 column
-        let uint32_array = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<UInt32Array>()
-            .unwrap();
-        assert_eq!(uint32_array.value(0), 1000_u32); // Valid
-        assert!(uint32_array.is_null(1)); // Negative
-        assert!(uint32_array.is_null(2)); // Missing
-    }
-
-    #[test]
-    fn test_encode_non_nullable_field_with_null_value() {
-        // Test that encoding fails when a non-nullable field encounters a null value
-        let mut log1 = LogEvent::default();
-        log1.insert("required_field", 42);
-
-        let log2 = LogEvent::default();
-        // log2 is missing required_field - should cause an error
-
-        let events = vec![Event::Log(log1), Event::Log(log2)];
-
-        // Create schema with non-nullable field
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "required_field",
-            DataType::Int64,
-            false, // Not nullable
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(schema));
-        assert!(result.is_err());
-
-        match result.unwrap_err() {
-            ArrowEncodingError::NullConstraint { field_name } => {
-                assert_eq!(field_name, "required_field");
-            }
-            other => panic!("Expected NullConstraint error, got: {:?}", other),
-        }
-    }
-
-    #[test]
-    fn test_encode_non_nullable_string_field_with_missing_value() {
-        // Test that encoding fails for non-nullable string field
-        let mut log1 = LogEvent::default();
-        log1.insert("name", "Alice");
-
-        let mut log2 = LogEvent::default();
-        log2.insert("name", "Bob");
-
-        let log3 = LogEvent::default();
-        // log3 is missing name field
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "name",
-            DataType::Utf8,
-            false, // Not nullable
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(schema));
-        assert!(result.is_err());
-
-        match result.unwrap_err() {
-            ArrowEncodingError::NullConstraint { field_name } => {
-                assert_eq!(field_name, "name");
-            }
-            other => panic!("Expected NullConstraint error, got: {:?}", other),
-        }
-    }
-
-    #[test]
-    fn test_encode_non_nullable_field_all_values_present() {
-        // Test that encoding succeeds when all values are present for non-nullable field
-        let mut log1 = LogEvent::default();
-        log1.insert("id", 1);
-
-        let mut log2 = LogEvent::default();
-        log2.insert("id", 2);
-
-        let mut log3 = LogEvent::default();
-        log3.insert("id", 3);
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "id",
-            DataType::Int64,
-            false, // Not nullable
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 3);
-
-        let id_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-
-        assert_eq!(id_array.value(0), 1);
-        assert_eq!(id_array.value(1), 2);
-        assert_eq!(id_array.value(2), 3);
-        assert!(!id_array.is_null(0));
-        assert!(!id_array.is_null(1));
-        assert!(!id_array.is_null(2));
-    }
-
-    #[test]
-    fn test_config_allow_nullable_fields_overrides_schema() {
-        use tokio_util::codec::Encoder;
-
-        // Create events: One valid, one missing the "required" field
-        let mut log1 = LogEvent::default();
-        log1.insert("strict_field", 42);
-        let log2 = LogEvent::default();
-        let events = vec![Event::Log(log1), Event::Log(log2)];
-
-        let schema = Schema::new(vec![Field::new("strict_field", DataType::Int64, false)]);
-
-        let mut config = ArrowStreamSerializerConfig::new(schema);
-        config.allow_nullable_fields = true;
-
-        let mut serializer =
-            ArrowStreamSerializer::new(config).expect("Failed to create serializer");
-
-        let mut buffer = BytesMut::new();
-        serializer
-            .encode(events, &mut buffer)
-            .expect("Encoding should succeed when allow_nullable_fields is true");
-
-        let cursor = Cursor::new(buffer);
-        let mut reader = StreamReader::try_new(cursor, None).expect("Failed to create reader");
-        let batch = reader.next().unwrap().expect("Failed to read batch");
-
-        assert_eq!(batch.num_rows(), 2);
-
-        let binding = batch.schema();
-        let output_field = binding.field(0);
-        assert!(
-            output_field.is_nullable(),
-            "The output schema field should have been transformed to nullable=true"
-        );
-
-        let array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-
-        assert_eq!(array.value(0), 42);
-        assert!(!array.is_null(0));
-        assert!(
-            array.is_null(1),
-            "The missing value should be encoded as null"
-        );
-    }
-
-    #[test]
-    fn test_make_field_nullable_with_nested_types() {
-        // Test that make_field_nullable recursively handles List and Struct types
-
-        // Create a nested structure: Struct containing a List of Structs
-        // struct { inner_list: [{ nested_field: Int64 }] }
-        let inner_struct_field = Field::new("nested_field", DataType::Int64, false);
-        let inner_struct =
-            DataType::Struct(arrow::datatypes::Fields::from(vec![inner_struct_field]));
-        let list_field = Field::new("item", inner_struct, false);
-        let list_type = DataType::List(Arc::new(list_field));
-        let outer_field = Field::new("inner_list", list_type, false);
-        let outer_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![outer_field]));
-
-        let original_field = Field::new("root", outer_struct, false);
-
-        // Apply make_field_nullable
-        let nullable_field = make_field_nullable(&original_field);
-
-        // Verify root field is nullable
-        assert!(
-            nullable_field.is_nullable(),
-            "Root field should be nullable"
-        );
-
-        // Verify nested struct is nullable
-        if let DataType::Struct(root_fields) = nullable_field.data_type() {
-            let inner_list_field = &root_fields[0];
-            assert!(
-                inner_list_field.is_nullable(),
-                "inner_list field should be nullable"
-            );
-
-            // Verify list element is nullable
-            if let DataType::List(list_item_field) = inner_list_field.data_type() {
-                assert!(
-                    list_item_field.is_nullable(),
-                    "List item field should be nullable"
-                );
-
-                // Verify inner struct fields are nullable
-                if let DataType::Struct(inner_struct_fields) = list_item_field.data_type() {
-                    let nested_field = &inner_struct_fields[0];
-                    assert!(
-                        nested_field.is_nullable(),
-                        "nested_field should be nullable"
-                    );
-                } else {
-                    panic!("Expected Struct type for list items");
-                }
-            } else {
-                panic!("Expected List type for inner_list");
-            }
-        } else {
-            panic!("Expected Struct type for root field");
-        }
-    }
-
-    #[test]
-    fn test_make_field_nullable_with_map_type() {
-        // Test that make_field_nullable handles Map types
-        // Map is internally represented as List<Struct<key, value>>
-
-        // Create a map: Map<String, Int64>
-        // Internally: List<Struct<key, value>>
-        let key_field = Field::new("key", DataType::Utf8, false);
-        let value_field = Field::new("value", DataType::Int64, false);
-        let entries_struct =
-            DataType::Struct(arrow::datatypes::Fields::from(vec![key_field, value_field]));
-        let entries_field = Field::new("entries", entries_struct, false);
-        let map_type = DataType::Map(Arc::new(entries_field), false);
-
-        let original_field = Field::new("my_map", map_type, false);
-
-        // Apply make_field_nullable
-        let nullable_field = make_field_nullable(&original_field);
-
-        // Verify root field is nullable
-        assert!(
-            nullable_field.is_nullable(),
-            "Root map field should be nullable"
-        );
-
-        // Verify map entries are nullable
-        if let DataType::Map(entries_field, _sorted) = nullable_field.data_type() {
-            assert!(
-                entries_field.is_nullable(),
-                "Map entries field should be nullable"
-            );
-
-            // Verify the struct inside the map is nullable
-            if let DataType::Struct(struct_fields) = entries_field.data_type() {
-                let key_field = &struct_fields[0];
-                let value_field = &struct_fields[1];
-                assert!(key_field.is_nullable(), "Map key field should be nullable");
-                assert!(
-                    value_field.is_nullable(),
-                    "Map value field should be nullable"
-                );
-            } else {
-                panic!("Expected Struct type for map entries");
-            }
-        } else {
-            panic!("Expected Map type for my_map field");
-        }
-    }
-}
diff --git a/lib/codecs/src/encoding/format/arrow/builder.rs b/lib/codecs/src/encoding/format/arrow/builder.rs
new file mode 100644
index 0000000000000..240edb9ec7a1c
--- /dev/null
+++ b/lib/codecs/src/encoding/format/arrow/builder.rs
@@ -0,0 +1,406 @@
+//! Arrow record batch builder
+//!
+//! Builds Arrow RecordBatches from Vector events by creating appropriate
+//! array builders and appending values according to the schema.
+
+use arrow::{
+    array::{
+        ArrayBuilder, ArrayRef, BinaryBuilder, BooleanBuilder, Decimal128Builder,
+        Decimal256Builder, Float32Builder, Float64Builder, Int8Builder, Int16Builder, Int32Builder,
+        Int64Builder, ListBuilder, MapBuilder, StringBuilder, StructBuilder,
+        TimestampMicrosecondBuilder, TimestampMillisecondBuilder, TimestampNanosecondBuilder,
+        TimestampSecondBuilder, UInt8Builder, UInt16Builder, UInt32Builder, UInt64Builder,
+    },
+    datatypes::{DataType, Field, SchemaRef, TimeUnit, i256},
+    record_batch::RecordBatch,
+};
+use vector_core::event::{Event, Value};
+
+use super::{ArrowEncodingError, types::create_array_builder_for_type};
+
+/// Checks if a data type is supported by the Arrow encoder.
+fn is_supported_type(data_type: &DataType) -> bool {
+    matches!(
+        data_type,
+        DataType::Int8
+            | DataType::Int16
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::UInt8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64
+            | DataType::Float32
+            | DataType::Float64
+            | DataType::Boolean
+            | DataType::Utf8
+            | DataType::Binary
+            | DataType::Timestamp(_, _)
+            | DataType::Decimal128(_, _)
+            | DataType::Decimal256(_, _)
+            | DataType::List(_)
+            | DataType::Struct(_)
+            | DataType::Map(_, _)
+    )
+}
+
+/// Helper macro for downcasting builders
+macro_rules! downcast_builder {
+    // Infallible version - used for non-complex types
+    ($builder:expr, $builder_type:ty) => {
+        $builder
+            .as_any_mut()
+            .downcast_mut::<$builder_type>()
+            .expect(concat!(
+                "Failed to downcast builder to ",
+                stringify!($builder_type)
+            ))
+    };
+
+    // Fallible version - used for complex types (returns Result for error handling)
+    ($builder:expr, $builder_type:ty, $field:expr) => {
+        $builder
+            .as_any_mut()
+            .downcast_mut::<$builder_type>()
+            .ok_or_else(|| ArrowEncodingError::UnsupportedType {
+                field_name: $field.name().clone(),
+                data_type: $field.data_type().clone(),
+            })
+    };
+}
+
+/// Macro to simplify appending null values by generating match arms
+macro_rules! append_null_match {
+    ($builder:expr, $data_type:expr, {$($pattern:pat => $builder_type:ty),* $(,)?}) => {
+        match $data_type {
+            $($pattern => downcast_builder!($builder, $builder_type).append_null(),)*
+            _ => {}
+        }
+    };
+}
+
+/// Helper function to serialize a Value to JSON string.
+/// This is used when the schema expects a string but the data contains complex types.
+fn value_to_json_string(value: &Value) -> Result<String, ArrowEncodingError> {
+    serde_json::to_string(value).map_err(|e| ArrowEncodingError::Io {
+        source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
+    })
+}
+
+/// Appends a null value to an array builder based on its type.
+fn append_null_to_builder(
+    builder: &mut dyn ArrayBuilder,
+    data_type: &DataType,
+) -> Result<(), ArrowEncodingError> {
+    append_null_match!(builder, data_type, {
+        DataType::Int8 => Int8Builder,
+        DataType::Int16 => Int16Builder,
+        DataType::Int32 => Int32Builder,
+        DataType::Int64 => Int64Builder,
+        DataType::UInt8 => UInt8Builder,
+        DataType::UInt16 => UInt16Builder,
+        DataType::UInt32 => UInt32Builder,
+        DataType::UInt64 => UInt64Builder,
+        DataType::Float32 => Float32Builder,
+        DataType::Float64 => Float64Builder,
+        DataType::Boolean => BooleanBuilder,
+        DataType::Utf8 => StringBuilder,
+        DataType::Binary => BinaryBuilder,
+        DataType::Timestamp(TimeUnit::Second, _) => TimestampSecondBuilder,
+        DataType::Timestamp(TimeUnit::Millisecond, _) => TimestampMillisecondBuilder,
+        DataType::Timestamp(TimeUnit::Microsecond, _) => TimestampMicrosecondBuilder,
+        DataType::Timestamp(TimeUnit::Nanosecond, _) => TimestampNanosecondBuilder,
+        DataType::Decimal128(_, _) => Decimal128Builder,
+        DataType::Decimal256(_, _) => Decimal256Builder,
+        DataType::List(_) => ListBuilder<Box<dyn ArrayBuilder>>,
+        DataType::Struct(_) => StructBuilder,
+    });
+
+    // Special case: Map uses append(false) instead of append_null()
+    if matches!(data_type, DataType::Map(_, _)) {
+        downcast_builder!(builder, MapBuilder<StringBuilder, Box<dyn ArrayBuilder>>)
+            .append(false)
+            .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?;
+    }
+
+    Ok(())
+}
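One subtlety worth calling out from `append_null_to_builder`: arrow-rs's `MapBuilder` has no `append_null()`, so a null map row is written by closing an empty entry with `append(false)`. A standalone sketch of that behavior (builder types from `arrow::array`; concrete key/value types chosen for illustration):

```rust
use arrow::array::{Array, Int64Builder, MapBuilder, StringBuilder};

let mut map_builder = MapBuilder::new(None, StringBuilder::new(), Int64Builder::new());

map_builder.append(false).unwrap(); // row 0: null map, no key/value pairs

map_builder.keys().append_value("retries");
map_builder.values().append_value(3);
map_builder.append(true).unwrap(); // row 1: {"retries": 3}

let array = map_builder.finish();
assert!(array.is_null(0));
assert!(!array.is_null(1));
```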
+
+/// Recursively appends a VRL Value to an Arrow array builder.
+fn append_value_to_builder(
+    builder: &mut dyn ArrayBuilder,
+    value: &Value,
+    field: &Field,
+) -> Result<(), ArrowEncodingError> {
+    match (field.data_type(), value) {
+        // Integer types with range checking
+        (DataType::Int8, Value::Integer(i)) => {
+            let val = (*i >= i8::MIN as i64 && *i <= i8::MAX as i64).then_some(*i as i8);
+            downcast_builder!(builder, Int8Builder).append_option(val);
+        }
+        (DataType::Int16, Value::Integer(i)) => {
+            let val = (*i >= i16::MIN as i64 && *i <= i16::MAX as i64).then_some(*i as i16);
+            downcast_builder!(builder, Int16Builder).append_option(val);
+        }
+        (DataType::Int32, Value::Integer(i)) => {
+            let val = (*i >= i32::MIN as i64 && *i <= i32::MAX as i64).then_some(*i as i32);
+            downcast_builder!(builder, Int32Builder).append_option(val);
+        }
+        (DataType::Int64, Value::Integer(i)) => {
+            downcast_builder!(builder, Int64Builder).append_value(*i);
+        }
+
+        // Unsigned integer types with range checking
+        (DataType::UInt8, Value::Integer(i)) => {
+            let val = (*i >= 0 && *i <= u8::MAX as i64).then_some(*i as u8);
+            downcast_builder!(builder, UInt8Builder).append_option(val);
+        }
+        (DataType::UInt16, Value::Integer(i)) => {
+            let val = (*i >= 0 && *i <= u16::MAX as i64).then_some(*i as u16);
+            downcast_builder!(builder, UInt16Builder).append_option(val);
+        }
+        (DataType::UInt32, Value::Integer(i)) => {
+            let val = (*i >= 0 && *i <= u32::MAX as i64).then_some(*i as u32);
+            downcast_builder!(builder, UInt32Builder).append_option(val);
+        }
+        (DataType::UInt64, Value::Integer(i)) => {
+            let val = (*i >= 0).then_some(*i as u64);
+            downcast_builder!(builder, UInt64Builder).append_option(val);
+        }
+
+        // Float types
+        (DataType::Float32, Value::Float(f)) => {
+            downcast_builder!(builder, Float32Builder).append_value(f.into_inner() as f32);
+        }
+        (DataType::Float32, Value::Integer(i)) => {
+            downcast_builder!(builder, Float32Builder).append_value(*i as f32);
+        }
+        (DataType::Float64, Value::Float(f)) => {
+            downcast_builder!(builder, Float64Builder).append_value(f.into_inner());
+        }
+        (DataType::Float64, Value::Integer(i)) => {
+            downcast_builder!(builder, Float64Builder).append_value(*i as f64);
+        }
+
+        // Boolean
+        (DataType::Boolean, Value::Boolean(b)) => {
+            downcast_builder!(builder, BooleanBuilder).append_value(*b);
+        }
+        // String types
+        (DataType::Utf8, Value::Bytes(bytes)) => match std::str::from_utf8(bytes) {
+            Ok(s) => downcast_builder!(builder, StringBuilder).append_value(s),
+            Err(_) => {
+                let s = String::from_utf8_lossy(bytes);
+                downcast_builder!(builder, StringBuilder).append_value(&s)
+            }
+        },
+        // Object -> String
+        (DataType::Utf8, Value::Object(obj)) => {
+            let json_str = value_to_json_string(&Value::Object(obj.clone()))?;
+            downcast_builder!(builder, StringBuilder).append_value(&json_str);
+        }
+        // Array -> String
+        (DataType::Utf8, Value::Array(arr)) => {
+            let json_str = value_to_json_string(&Value::Array(arr.clone()))?;
+            downcast_builder!(builder, StringBuilder).append_value(&json_str);
+        }
+        (DataType::Binary, Value::Bytes(bytes)) => {
+            downcast_builder!(builder, BinaryBuilder).append_value(bytes);
+        }
+
+        // Timestamp types
+        (DataType::Timestamp(time_unit, _), value) => {
+            use chrono::Utc;
+
+            let timestamp_value = match value {
+                Value::Timestamp(ts) => Some(*ts),
+                Value::Bytes(bytes) => std::str::from_utf8(bytes)
+                    .ok()
+                    .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok())
+                    .map(|dt| dt.with_timezone(&Utc)),
+                _ => None,
+            };
+
+            let converted_value = match (time_unit, timestamp_value) {
+                (TimeUnit::Second, Some(ts)) => Some(ts.timestamp()),
+                (TimeUnit::Millisecond, Some(ts)) => Some(ts.timestamp_millis()),
+                (TimeUnit::Microsecond, Some(ts)) => Some(ts.timestamp_micros()),
+                (TimeUnit::Nanosecond, Some(ts)) => ts.timestamp_nanos_opt(),
+                _ => {
+                    // Fallback to raw integer if not a timestamp
+                    if let Value::Integer(i) = value {
+                        Some(*i)
+                    } else {
+                        None
+                    }
+                }
+            };
+
+            match time_unit {
+                TimeUnit::Second => {
+                    downcast_builder!(builder, TimestampSecondBuilder)
+                        .append_option(converted_value);
+                }
+                TimeUnit::Millisecond => {
+                    downcast_builder!(builder, TimestampMillisecondBuilder)
+                        .append_option(converted_value);
+                }
+                TimeUnit::Microsecond => {
+                    downcast_builder!(builder, TimestampMicrosecondBuilder)
+                        .append_option(converted_value);
+                }
+                TimeUnit::Nanosecond => {
+                    downcast_builder!(builder, TimestampNanosecondBuilder)
+                        .append_option(converted_value);
+                }
+            }
+        }
+
+        // Decimal types
+        (DataType::Decimal128(_precision, scale), value) => {
+            use rust_decimal::Decimal;
+
+            let target_scale = scale.unsigned_abs() as u32;
+
+            let mantissa = match value {
+                Value::Float(f) => Decimal::try_from(f.into_inner()).ok().map(|mut d| {
+                    d.rescale(target_scale);
+                    d.mantissa()
+                }),
+                Value::Integer(i) => {
+                    let mut decimal = Decimal::from(*i);
+                    decimal.rescale(target_scale);
+                    Some(decimal.mantissa())
+                }
+                _ => None,
+            };
+
+            downcast_builder!(builder, Decimal128Builder).append_option(mantissa);
+        }
+
+        (DataType::Decimal256(_precision, scale), value) => {
+            use rust_decimal::Decimal;
+
+            let target_scale = scale.unsigned_abs() as u32;
+
+            let mantissa = match value {
+                Value::Float(f) => Decimal::try_from(f.into_inner()).ok().map(|mut d| {
+                    d.rescale(target_scale);
+                    i256::from_i128(d.mantissa())
+                }),
+                Value::Integer(i) => {
+                    let mut decimal = Decimal::from(*i);
+                    decimal.rescale(target_scale);
+                    Some(i256::from_i128(decimal.mantissa()))
+                }
+                _ => None,
+            };
+
+            downcast_builder!(builder, Decimal256Builder).append_option(mantissa);
+        }
+
+        // Complex types
+        (DataType::List(inner_field), Value::Array(arr)) => {
+            let list_builder =
+                downcast_builder!(builder, ListBuilder<Box<dyn ArrayBuilder>>, field)?;
+
+            for item in arr.iter() {
+                append_value_to_builder(list_builder.values(), item, inner_field)?;
+            }
+            list_builder.append(true);
+        }
+
+        (DataType::Struct(fields), Value::Object(obj)) => {
+            let struct_builder = downcast_builder!(builder, StructBuilder, field)?;
+
+            for (i, field) in fields.iter().enumerate() {
+                // Use the actual field name from the schema
+                // This supports both named tuples and unnamed tuples (which use "f0", "f1", etc.)
+                let key = field.name();
+                let field_builder = &mut struct_builder.field_builders_mut()[i];
+                match obj.get(key.as_str()) {
+                    Some(val) => append_value_to_builder(field_builder.as_mut(), val, field)?,
+                    None => append_null_to_builder(field_builder.as_mut(), field.data_type())?,
+                }
+            }
+            struct_builder.append(true);
+        }
+
+        (DataType::Map(entries_field, _), Value::Object(obj)) => {
+            let map_builder =
+                downcast_builder!(builder, MapBuilder<StringBuilder, Box<dyn ArrayBuilder>>, field)?;
+
+            let DataType::Struct(entries_struct) = entries_field.data_type() else {
+                return Err(ArrowEncodingError::UnsupportedType {
+                    field_name: field.name().clone(),
+                    data_type: field.data_type().clone(),
+                });
+            };
+
+            let value_field = &entries_struct[1];
+            for (key, value) in obj.iter() {
+                map_builder.keys().append_value(key.as_ref());
+                append_value_to_builder(map_builder.values(), value, value_field)?;
+            }
+            map_builder
+                .append(true)
+                .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?;
+        }
+
+        // Unsupported type/value combinations
+        _ => {
+            if !is_supported_type(field.data_type()) {
+                return Err(ArrowEncodingError::UnsupportedType {
+                    field_name: field.name().clone(),
+                    data_type: field.data_type().clone(),
+                });
+            }
+
+            // Supported type but value is missing/incompatible
+            if field.is_nullable() {
+                append_null_to_builder(builder, field.data_type())?;
+            } else {
+                return Err(ArrowEncodingError::NullConstraint {
+                    field_name: field.name().clone(),
+                });
+            }
+        }
+    }
+    Ok(())
+}
+
+fn build_array_for_field(events: &[Event], field: &Field) -> Result<ArrayRef, ArrowEncodingError> {
+    let mut builder = create_array_builder_for_type(field.data_type(), events.len())?;
+
+    events.iter().try_for_each(|event| {
+        let Event::Log(log) = event else {
+            return Ok(());
+        };
+
+        match log.get(field.name().as_str()) {
+            Some(value) => append_value_to_builder(builder.as_mut(), value, field),
+            None if field.is_nullable() => {
+                append_null_to_builder(builder.as_mut(), field.data_type())
+            }
+            None => Err(ArrowEncodingError::NullConstraint {
+                field_name: field.name().clone(),
+            }),
+        }
+    })?;
+
+    Ok(builder.finish())
+}
+
+/// Builds an Arrow RecordBatch from events
+pub(crate) fn build_record_batch(
+    schema: SchemaRef,
+    events: &[Event],
+) -> Result<RecordBatch, ArrowEncodingError> {
+    let columns: Vec<ArrayRef> = schema
+        .fields()
+        .iter()
+        .map(|field| build_array_for_field(events, field))
+        .collect::<Result<_, _>>()?;
+
+    RecordBatch::try_new(schema, columns)
+        .map_err(|source| ArrowEncodingError::RecordBatchCreation { source })
+}
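A quick sketch of how this crate-private entry point is driven, mirroring what the serializer in `mod.rs` (next file) does; the schema and field name are illustrative:

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use vector_core::event::{Event, LogEvent};

let schema: SchemaRef = Arc::new(Schema::new(vec![Field::new(
    "message",
    DataType::Utf8,
    true,
)]));

let mut log = LogEvent::default();
log.insert("message", "hello");

// One column per schema field, one row per event.
let batch = build_record_batch(schema, &[Event::Log(log)]).expect("batch builds");
assert_eq!(batch.num_rows(), 1);
```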
+#[async_trait]
+pub trait SchemaProvider: Send + Sync + std::fmt::Debug {
+    /// Fetch the Arrow schema from the data store.
+    ///
+    /// This is called during the sink configuration build phase to fetch
+    /// the schema once at startup, rather than at runtime.
+    async fn get_schema(&self) -> Result<Schema, ArrowEncodingError>;
+}
+
+/// Configuration for Arrow IPC stream serialization
+#[configurable_component]
+#[derive(Clone, Default)]
+pub struct ArrowStreamSerializerConfig {
+    /// The Arrow schema to use for encoding
+    #[serde(skip)]
+    #[configurable(derived)]
+    pub schema: Option<Schema>,
+
+    /// Allow null values for non-nullable fields in the schema.
+    ///
+    /// When enabled, missing or incompatible values will be encoded as null even for fields
+    /// marked as non-nullable in the Arrow schema. This is useful when working with downstream
+    /// systems that can handle null values through defaults, computed columns, or other mechanisms.
+    ///
+    /// When disabled (the default), missing values for non-nullable fields will cause encoding errors,
+    /// ensuring all required data is present before sending to the sink.
+    #[serde(default)]
+    #[configurable(derived)]
+    pub allow_nullable_fields: bool,
+}
+
+impl std::fmt::Debug for ArrowStreamSerializerConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ArrowStreamSerializerConfig")
+            .field(
+                "schema",
+                &self
+                    .schema
+                    .as_ref()
+                    .map(|s| format!("{} fields", s.fields().len())),
+            )
+            .field("allow_nullable_fields", &self.allow_nullable_fields)
+            .finish()
+    }
+}
+
+impl ArrowStreamSerializerConfig {
+    /// Create a new ArrowStreamSerializerConfig with a schema
+    pub fn new(schema: arrow::datatypes::Schema) -> Self {
+        Self {
+            schema: Some(schema),
+            allow_nullable_fields: false,
+        }
+    }
+
+    /// The data type of events that are accepted by `ArrowStreamEncoder`.
+    pub fn input_type(&self) -> vector_core::config::DataType {
+        vector_core::config::DataType::Log
+    }
+
+    /// The schema required by the serializer.
+    pub fn schema_requirement(&self) -> vector_core::schema::Requirement {
+        vector_core::schema::Requirement::empty()
+    }
+}
+
+/// Arrow IPC stream batch serializer that holds the schema
+#[derive(Clone, Debug)]
+pub struct ArrowStreamSerializer {
+    schema: SchemaRef,
+}
+
+impl ArrowStreamSerializer {
+    /// Create a new ArrowStreamSerializer with the given configuration
+    pub fn new(config: ArrowStreamSerializerConfig) -> Result<Self, vector_common::Error> {
+        let schema = config
+            .schema
+            .ok_or_else(|| vector_common::Error::from("Arrow serializer requires a schema."))?;
+
+        // If allow_nullable_fields is enabled, transform the schema once here
+        // instead of on every batch encoding
+        let schema = if config.allow_nullable_fields {
+            Schema::new_with_metadata(
+                schema
+                    .fields()
+                    .iter()
+                    .map(|f| make_field_nullable(f).into())
+                    .collect::<Vec<FieldRef>>(),
+                schema.metadata().clone(),
+            )
+        } else {
+            schema
+        };
+
+        Ok(Self {
+            schema: SchemaRef::new(schema),
+        })
+    }
+}
+
+impl tokio_util::codec::Encoder<Vec<vector_core::event::Event>> for ArrowStreamSerializer {
+    type Error = ArrowEncodingError;
+
+    fn encode(
+        &mut self,
+        events: Vec<vector_core::event::Event>,
+        buffer: &mut BytesMut,
+    ) -> Result<(), Self::Error> {
+        if events.is_empty() {
+            return Err(ArrowEncodingError::NoEvents);
+        }
+
+        let bytes = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&self.schema)))?;
+
+        buffer.extend_from_slice(&bytes);
+        Ok(())
+    }
+}
+
+/// Errors that can occur during Arrow encoding
+#[derive(Debug, Snafu)]
+pub enum ArrowEncodingError {
+    /// Failed to create Arrow record batch
+    #[snafu(display("Failed to create Arrow record batch: {}", source))]
+    RecordBatchCreation {
+        /// The underlying Arrow error
+        source: arrow::error::ArrowError,
+    },
+
+    /// Failed to write Arrow IPC data
+    #[snafu(display("Failed to write Arrow IPC data: {}", source))]
+    IpcWrite {
+        /// The underlying Arrow error
+        source: arrow::error::ArrowError,
+    },
+
+    /// No events provided for encoding
+    #[snafu(display("No events provided for encoding"))]
+    NoEvents,
+
+    /// Schema must be provided before encoding
+    #[snafu(display("Schema must be provided before encoding"))]
+    NoSchemaProvided,
+
+    /// Failed to fetch schema from provider
+    #[snafu(display("Failed to fetch schema from provider: {}", message))]
+    SchemaFetchError {
+        /// Error message from the provider
+        message: String,
+    },
+
+    /// Unsupported Arrow data type for field
+    #[snafu(display(
+        "Unsupported Arrow data type for field '{}': {:?}",
+        field_name,
+        data_type
+    ))]
+    UnsupportedType {
+        /// The field name
+        field_name: String,
+        /// The unsupported data type
+        data_type: DataType,
+    },
+
+    /// Null value encountered for non-nullable field
+    #[snafu(display("Null value for non-nullable field '{}'", field_name))]
+    NullConstraint {
+        /// The field name
+        field_name: String,
+    },
+
+    /// IO error during encoding
+    #[snafu(display("IO error: {}", source))]
+    Io {
+        /// The underlying IO error
+        source: std::io::Error,
+    },
+}
+
+impl From<std::io::Error> for ArrowEncodingError {
+    fn from(error: std::io::Error) -> Self {
+        Self::Io { source: error }
+    }
+}
+
+/// Encodes a batch of events into Arrow IPC streaming format
+pub fn encode_events_to_arrow_ipc_stream(
+    events: &[vector_core::event::Event],
+    schema: Option<SchemaRef>,
+) -> Result<Bytes, ArrowEncodingError> {
+    if events.is_empty() {
+        return Err(ArrowEncodingError::NoEvents);
+    }
+
+    let schema_ref = schema.ok_or(ArrowEncodingError::NoSchemaProvided)?;
+
+    let record_batch = build_record_batch(schema_ref, events)?;
+
+    let ipc_err = |source| ArrowEncodingError::IpcWrite { source };
+
+    let mut buffer = BytesMut::new().writer();
+    let mut writer =
+        StreamWriter::try_new(&mut buffer, record_batch.schema_ref()).map_err(ipc_err)?;
+    writer.write(&record_batch).map_err(ipc_err)?;
+    writer.finish().map_err(ipc_err)?;
+
+    Ok(buffer.into_inner().freeze())
+}
+
+/// Recursively makes a Field and all its nested fields nullable
+fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Field {
+    let new_data_type = match field.data_type() {
+        DataType::List(inner_field) => DataType::List(make_field_nullable(inner_field).into()),
+        DataType::Struct(fields) => {
+            DataType::Struct(fields.iter().map(|f| make_field_nullable(f)).collect())
+        }
+        DataType::Map(inner, sorted) => {
+            // A Map's inner field is typically an "entries" Struct
+            let DataType::Struct(fields) = inner.data_type() else {
+                // Fallback for invalid Map structures (preserves the original)
+                return field.clone().with_nullable(true);
+            };
+
+            let new_struct_fields =
+                vec![fields[0].clone(), make_field_nullable(&fields[1]).into()];
+
+            // Reconstruct the inner "entries" field
+            // The inner field itself must be non-nullable (only the Map wrapper is nullable)
+            let new_inner_field = inner
+                .as_ref()
+                .clone()
+                .with_data_type(DataType::Struct(new_struct_fields.into()))
+                .with_nullable(false);
+
+            DataType::Map(new_inner_field.into(), *sorted)
+        }
+        other => other.clone(),
+    };
+
+    field
+        .clone()
+        .with_data_type(new_data_type)
+        .with_nullable(true)
+}
diff --git a/lib/codecs/src/encoding/format/arrow/tests.rs b/lib/codecs/src/encoding/format/arrow/tests.rs
new file mode 100644
index 0000000000000..345da25c0b326
--- /dev/null
+++ b/lib/codecs/src/encoding/format/arrow/tests.rs
@@ -0,0 +1,1886 @@
+use super::*;
+use arrow::{
+    array::{
+        Array, ArrayRef, BinaryArray, BooleanArray, Float64Array, Int64Array, ListArray, MapArray,
+        StringArray, TimestampMicrosecondArray, TimestampMillisecondArray,
+        TimestampNanosecondArray, TimestampSecondArray,
+    },
+    datatypes::{DataType, Field, Fields, Schema, SchemaRef, TimeUnit},
+    ipc::reader::StreamReader,
+    record_batch::RecordBatch,
+};
+use chrono::Utc;
+use std::{io::Cursor, sync::Arc};
+use vector_core::event::{Event, LogEvent, Value};
+
+/// Helper to encode events and return the decoded RecordBatch
+fn encode_and_decode(
+    events: Vec<Event>,
+    schema: SchemaRef,
+) -> Result<RecordBatch, Box<dyn std::error::Error>> {
+    let bytes = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)))?;
+    let cursor = Cursor::new(bytes);
+    let mut reader = StreamReader::try_new(cursor, None)?;
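+    // Reading the first batch back ensures the encoded IPC stream round-trips cleanly.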
+    Ok(reader.next().unwrap()?)
+}
+
+/// Create a simple event from key-value pairs
+fn create_event<V>(fields: Vec<(&str, V)>) -> Event
+where
+    V: Into<Value>,
+{
+    let mut log = LogEvent::default();
+    for (key, value) in fields {
+        log.insert(key, value.into());
+    }
+    Event::Log(log)
+}
+
+/// Assert a column has expected integer values (with optional nulls)
+fn assert_int64_column(batch: &RecordBatch, col_index: usize, expected: &[Option<i64>]) {
+    let array = batch
+        .column(col_index)
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .expect("Expected Int64Array");
+
+    assert_eq!(
+        array.len(),
+        expected.len(),
+        "Array length mismatch at column {}",
+        col_index
+    );
+
+    for (i, &expected_val) in expected.iter().enumerate() {
+        match expected_val {
+            Some(val) => {
+                assert!(
+                    !array.is_null(i),
+                    "Expected value {} at index {}, got null",
+                    val,
+                    i
+                );
+                assert_eq!(array.value(i), val, "Value mismatch at index {}", i);
+            }
+            None => assert!(array.is_null(i), "Expected null at index {}, got value", i),
+        }
+    }
+}
+
+/// Create a schema with a single field
+fn single_field_schema(name: &str, data_type: DataType, nullable: bool) -> SchemaRef {
+    SchemaRef::new(Schema::new(vec![Field::new(name, data_type, nullable)]))
+}
+
+/// Assert a primitive value at a specific column and row
+macro_rules! assert_primitive_value {
+    ($batch:expr, $col:expr, $row:expr, $array_type:ty, $expected:expr) => {
+        assert_eq!(
+            $batch
+                .column($col)
+                .as_any()
+                .downcast_ref::<$array_type>()
+                .unwrap()
+                .value($row),
+            $expected
+        )
+    };
+}
+
+mod comprehensive {
+    use super::*;
+
+    #[test]
+    fn test_encode_all_types() {
+        use arrow::array::{
+            Decimal128Array, ListArray, MapArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array,
+        };
+        use vrl::value::ObjectMap;
+
+        let now = Utc::now();
+
+        // Create a struct (tuple) value with unnamed fields
+        let mut tuple_value = ObjectMap::new();
+        tuple_value.insert("f0".into(), Value::Bytes("nested_str".into()));
+        tuple_value.insert("f1".into(), Value::Integer(999));
+
+        // Create a named struct (named tuple) value
+        let mut named_tuple_value = ObjectMap::new();
+        named_tuple_value.insert("category".into(), Value::Bytes("test_category".into()));
+        named_tuple_value.insert("tag".into(), Value::Bytes("test_tag".into()));
+
+        // Create a list value
+        let list_value = Value::Array(vec![
+            Value::Integer(1),
+            Value::Integer(2),
+            Value::Integer(3),
+        ]);
+
+        // Create a map value
+        let mut map_value = ObjectMap::new();
+        map_value.insert("key1".into(), Value::Integer(100));
+        map_value.insert("key2".into(), Value::Integer(200));
+
+        let mut log = LogEvent::default();
+        // Primitive types
+        log.insert("string_field", "test");
+        log.insert("int8_field", 127);
+        log.insert("int16_field", 32000);
+        log.insert("int32_field", 1000000);
+        log.insert("int64_field", 42);
+        log.insert("uint8_field", 255);
+        log.insert("uint16_field", 65535);
+        log.insert("uint32_field", 4000000);
+        log.insert("uint64_field", 9000000000_i64);
+        log.insert("float32_field", 3.15);
+        log.insert("float64_field", 3.15);
+        log.insert("bool_field", true);
+        log.insert("bytes_field", bytes::Bytes::from("binary"));
+        log.insert("timestamp_field", now);
+        log.insert("decimal_field", 99.99);
+        // Complex types
+        log.insert("list_field", list_value);
+        log.insert("struct_field", Value::Object(tuple_value));
+        log.insert("named_struct_field", Value::Object(named_tuple_value));
+        log.insert("map_field", Value::Object(map_value));
+
+        let events = vec![Event::Log(log)];
+
+        // Build schema with all supported types
+        let struct_fields =
+            arrow::datatypes::Fields::from(vec![
+                Field::new("f0", DataType::Utf8, true),
+                Field::new("f1", DataType::Int64, true),
+            ]);
+
+        let named_struct_fields = arrow::datatypes::Fields::from(vec![
+            Field::new("category", DataType::Utf8, true),
+            Field::new("tag", DataType::Utf8, true),
+        ]);
+
+        let map_entries = Field::new(
+            "entries",
+            DataType::Struct(arrow::datatypes::Fields::from(vec![
+                Field::new("keys", DataType::Utf8, false),
+                Field::new("values", DataType::Int64, true),
+            ])),
+            false,
+        );
+
+        let schema = SchemaRef::new(Schema::new(vec![
+            Field::new("string_field", DataType::Utf8, true),
+            Field::new("int8_field", DataType::Int8, true),
+            Field::new("int16_field", DataType::Int16, true),
+            Field::new("int32_field", DataType::Int32, true),
+            Field::new("int64_field", DataType::Int64, true),
+            Field::new("uint8_field", DataType::UInt8, true),
+            Field::new("uint16_field", DataType::UInt16, true),
+            Field::new("uint32_field", DataType::UInt32, true),
+            Field::new("uint64_field", DataType::UInt64, true),
+            Field::new("float32_field", DataType::Float32, true),
+            Field::new("float64_field", DataType::Float64, true),
+            Field::new("bool_field", DataType::Boolean, true),
+            Field::new("bytes_field", DataType::Binary, true),
+            Field::new(
+                "timestamp_field",
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                true,
+            ),
+            Field::new("decimal_field", DataType::Decimal128(10, 2), true),
+            Field::new(
+                "list_field",
+                DataType::List(Field::new("item", DataType::Int64, true).into()),
+                true,
+            ),
+            Field::new("struct_field", DataType::Struct(struct_fields), true),
+            Field::new(
+                "named_struct_field",
+                DataType::Struct(named_struct_fields),
+                true,
+            ),
+            Field::new("map_field", DataType::Map(map_entries.into(), false), true),
+        ]));
+
+        let batch = encode_and_decode(events, schema).expect("Failed to encode");
+
+        assert_eq!(batch.num_rows(), 1);
+        assert_eq!(batch.num_columns(), 19);
+
+        // Verify all primitive types
+        assert_eq!(
+            batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<StringArray>()
+                .unwrap()
+                .value(0),
+            "test"
+        );
+        assert_primitive_value!(batch, 1, 0, arrow::array::Int8Array, 127);
+        assert_primitive_value!(batch, 2, 0, arrow::array::Int16Array, 32000);
+        assert_primitive_value!(batch, 3, 0, arrow::array::Int32Array, 1000000);
+        assert_primitive_value!(batch, 4, 0, Int64Array, 42);
+        assert_primitive_value!(batch, 5, 0, UInt8Array, 255);
+        assert_primitive_value!(batch, 6, 0, UInt16Array, 65535);
+        assert_primitive_value!(batch, 7, 0, UInt32Array, 4000000);
+        assert_primitive_value!(batch, 8, 0, UInt64Array, 9000000000);
+        assert!(
+            (batch
+                .column(9)
+                .as_any()
+                .downcast_ref::<arrow::array::Float32Array>()
+                .unwrap()
+                .value(0)
+                - 3.15)
+                .abs()
+                < 0.001
+        );
+        assert!(
+            (batch
+                .column(10)
+                .as_any()
+                .downcast_ref::<Float64Array>()
+                .unwrap()
+                .value(0)
+                - 3.15)
+                .abs()
+                < 0.001
+        );
+        assert!(
+            batch
+                .column(11)
+                .as_any()
+                .downcast_ref::<BooleanArray>()
+                .unwrap()
+                .value(0)
+        );
+        assert_primitive_value!(batch, 12, 0, BinaryArray, b"binary");
+        assert_primitive_value!(
+            batch,
+            13,
+            0,
+            TimestampMillisecondArray,
+            now.timestamp_millis()
+        );
+        assert_primitive_value!(batch, 14, 0, Decimal128Array, 9999);
+
+        let list_array = batch
+            .column(15)
+            .as_any()
+            .downcast_ref::<ListArray>()
+            .unwrap();
+        assert!(!list_array.is_null(0));
+        let list_value = list_array.value(0);
+        assert_eq!(list_value.len(), 3);
+        let int_array = list_value.as_any().downcast_ref::<Int64Array>().unwrap();
+        assert_eq!(int_array.value(0), 1);
+        assert_eq!(int_array.value(1), 2);
+        assert_eq!(int_array.value(2), 3);
+
+        // Verify struct field (unnamed)
+        let struct_array = batch
+            .column(16)
+            .as_any()
+            .downcast_ref::<arrow::array::StructArray>()
+            .unwrap();
+        assert!(!struct_array.is_null(0));
+        assert_primitive_value!(struct_array, 0, 0, StringArray, "nested_str");
+        assert_primitive_value!(struct_array, 1, 0, Int64Array, 999);
+
+        // Verify named struct field (named tuple)
+        let named_struct_array = batch
+            .column(17)
+            .as_any()
+            .downcast_ref::<arrow::array::StructArray>()
+            .unwrap();
+        assert!(!named_struct_array.is_null(0));
+        assert_primitive_value!(named_struct_array, 0, 0, StringArray, "test_category");
+        assert_primitive_value!(named_struct_array, 1, 0, StringArray, "test_tag");
+
+        // Verify map field
+        let map_array = batch
+            .column(18)
+            .as_any()
+            .downcast_ref::<MapArray>()
+            .unwrap();
+        assert!(!map_array.is_null(0));
+        let map_value = map_array.value(0);
+        assert_eq!(map_value.len(), 2);
+    }
+}
+
+mod edge_cases {
+    use super::*;
+
+    #[test]
+    fn test_encode_null_values() {
+        let events = vec![
+            create_event(vec![("field_a", 1_i64)]),
+            create_event(vec![("field_b", 2_i64)]),
+        ];
+
+        let schema = SchemaRef::new(Schema::new(vec![
+            Field::new("field_a", DataType::Int64, true),
+            Field::new("field_b", DataType::Int64, true),
+        ]));
+
+        let batch = encode_and_decode(events, schema).unwrap();
+
+        assert_eq!(batch.num_rows(), 2);
+        assert_int64_column(&batch, 0, &[Some(1), None]);
+        assert_int64_column(&batch, 1, &[None, Some(2)]);
+    }
+
+    #[test]
+    fn test_encode_type_mismatches() {
+        let events = vec![
+            create_event(vec![("field", 42_i64)]),
+            create_event(vec![("field", 3.15_f64)]), // Type mismatch!
+        ];
+
+        let schema = single_field_schema("field", DataType::Int64, true);
+        let batch = encode_and_decode(events, schema).unwrap();
+
+        assert_eq!(batch.num_rows(), 2);
+        // Type mismatch becomes null
+        assert_int64_column(&batch, 0, &[Some(42), None]);
+    }
+
+    #[test]
+    fn test_encode_empty_arrays_and_maps() {
+        use arrow::array::{ListArray, MapArray};
+        use vrl::value::ObjectMap;
+
+        let empty_array = Vec::<Value>::new();
+        let empty_map = ObjectMap::new();
+
+        let mut log = LogEvent::default();
+        log.insert("empty_array", Value::Array(empty_array));
+        log.insert("empty_map", Value::Object(empty_map));
+        log.insert(
+            "non_empty_array",
+            Value::Array(vec![Value::Integer(1), Value::Integer(2)]),
+        );
+
+        let events = vec![Event::Log(log)];
+
+        let array_field = Field::new("item", DataType::Int32, true);
+        let map_entries = Field::new(
+            "entries",
+            DataType::Struct(arrow::datatypes::Fields::from(vec![
+                Field::new("keys", DataType::Utf8, false),
+                Field::new("values", DataType::Int32, true),
+            ])),
+            false,
+        );
+
+        let schema = SchemaRef::new(Schema::new(vec![
+            Field::new(
+                "empty_array",
+                DataType::List(array_field.clone().into()),
+                true,
+            ),
+            Field::new("empty_map", DataType::Map(map_entries.into(), false), true),
+            Field::new("non_empty_array", DataType::List(array_field.into()), true),
+        ]));
+
+        let batch = encode_and_decode(events, schema).unwrap();
+
+        assert_eq!(batch.num_rows(), 1);
+        assert_eq!(batch.num_columns(), 3);
+
+        // Verify empty array
+        let empty_array = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<ListArray>()
+            .unwrap();
+        assert!(!empty_array.is_null(0));
+        assert_eq!(empty_array.value(0).len(), 0);
+
+        // Verify empty map
+        let empty_map = batch.column(1).as_any().downcast_ref::<MapArray>().unwrap();
+        assert!(!empty_map.is_null(0));
+        assert_eq!(empty_map.value(0).len(), 0);
+
+        // Verify non-empty array
+        let non_empty_array = batch
+            .column(2)
+            .as_any()
+            .downcast_ref::<ListArray>()
+            .unwrap();
+        
assert_eq!(non_empty_array.value(0).len(), 2); + } +} + +mod json_serialization { + use super::*; + + #[test] + fn test_encode_complex_json_values() { + use serde_json::json; + + let mut log = LogEvent::default(); + log.insert( + "object_field", + json!({"key": "value", "nested": {"count": 42}}), + ); + log.insert("array_field", json!([1, 2, 3])); + + let events = vec![Event::Log(log)]; + + let schema = SchemaRef::new(Schema::new(vec![ + Field::new("object_field", DataType::Utf8, true), + Field::new("array_field", DataType::Utf8, true), + ])); + + let batch = encode_and_decode(events, schema).unwrap(); + + assert_eq!(batch.num_rows(), 1); + + let object_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let object_str = object_array.value(0); + assert!(object_str.contains("key")); + assert!(object_str.contains("value")); + + let array_array = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(array_array.value(0), "[1,2,3]"); + } +} + +mod error_handling { + use super::*; + + #[test] + fn test_encode_unsupported_type() { + let events = vec![create_event(vec![("field", "value")])]; + + let schema = single_field_schema("field", DataType::Duration(TimeUnit::Millisecond), true); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(schema)); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + ArrowEncodingError::UnsupportedType { .. } + )); + } + + #[test] + fn test_encode_without_schema_fails() { + let events = vec![create_event(vec![("message", "hello")])]; + + let result = encode_events_to_arrow_ipc_stream(&events, None); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + ArrowEncodingError::NoSchemaProvided + )); + } + + #[test] + fn test_encode_empty_events() { + let events: Vec = vec![]; + let result = encode_events_to_arrow_ipc_stream(&events, None); + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), ArrowEncodingError::NoEvents)); + } +} + +mod temporal_types { + use super::*; + + #[test] + fn test_encode_timestamp_precisions() { + let now = Utc::now(); + let mut log = LogEvent::default(); + log.insert("ts_second", now); + log.insert("ts_milli", now); + log.insert("ts_micro", now); + log.insert("ts_nano", now); + + let events = vec![Event::Log(log)]; + + let schema = SchemaRef::new(Schema::new(vec![ + Field::new( + "ts_second", + DataType::Timestamp(TimeUnit::Second, None), + true, + ), + Field::new( + "ts_milli", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new( + "ts_micro", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ), + Field::new( + "ts_nano", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + ])); + + let batch = encode_and_decode(events, schema).unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 4); + + let ts_second = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!ts_second.is_null(0)); + assert_eq!(ts_second.value(0), now.timestamp()); + + let ts_milli = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!ts_milli.is_null(0)); + assert_eq!(ts_milli.value(0), now.timestamp_millis()); + + let ts_micro = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!ts_micro.is_null(0)); + assert_eq!(ts_micro.value(0), now.timestamp_micros()); + + let ts_nano = batch + .column(3) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!ts_nano.is_null(0)); + assert_eq!(ts_nano.value(0), 
now.timestamp_nanos_opt().unwrap()); + } + + #[test] + fn test_encode_mixed_timestamp_string_and_native() { + // Test mixing string timestamps with native Timestamp values + let mut log1 = LogEvent::default(); + log1.insert("ts", "2025-10-22T10:18:44.256Z"); // String + + let mut log2 = LogEvent::default(); + log2.insert("ts", Utc::now()); // Native Timestamp + + let mut log3 = LogEvent::default(); + log3.insert("ts", 1729594724256000000_i64); // Integer (nanoseconds) + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); + + let batch = encode_and_decode(events, schema).unwrap(); + + assert_eq!(batch.num_rows(), 3); + + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // All three should be non-null + assert!(!ts_array.is_null(0)); + assert!(!ts_array.is_null(1)); + assert!(!ts_array.is_null(2)); + + // First one should match the parsed string + let expected = chrono::DateTime::parse_from_rfc3339("2025-10-22T10:18:44.256Z") + .unwrap() + .timestamp_nanos_opt() + .unwrap(); + assert_eq!(ts_array.value(0), expected); + + // Third one should match the integer + assert_eq!(ts_array.value(2), 1729594724256000000_i64); + } + + #[test] + fn test_encode_invalid_string_timestamp() { + // Test that invalid timestamp strings become null + let mut log1 = LogEvent::default(); + log1.insert("timestamp", "not-a-timestamp"); + + let mut log2 = LogEvent::default(); + log2.insert("timestamp", "2025-10-22T10:18:44.256Z"); // Valid + + let mut log3 = LogEvent::default(); + log3.insert("timestamp", "2025-99-99T99:99:99Z"); // Invalid + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); + + let batch = encode_and_decode(events, schema).unwrap(); + + assert_eq!(batch.num_rows(), 3); + + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // Invalid timestamps should be null + assert!(ts_array.is_null(0)); + assert!(!ts_array.is_null(1)); // Valid one + assert!(ts_array.is_null(2)); + } +} + +mod decimal_types { + use super::*; + + #[test] + fn test_encode_decimal128_from_integer() { + use arrow::array::Decimal128Array; + + let mut log = LogEvent::default(); + // Store quantity as integer: 1000 + log.insert("quantity", 1000_i64); + + let events = vec![Event::Log(log)]; + + // Decimal(10, 3) - will represent 1000 as 1000.000 + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "quantity", + DataType::Decimal128(10, 3), + true, + )])); + + let batch = encode_and_decode(events, schema).unwrap(); + + assert_eq!(batch.num_rows(), 1); + + let decimal_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert!(!decimal_array.is_null(0)); + // 1000 with scale 3 = 1000 * 10^3 = 1000000 + assert_eq!(decimal_array.value(0), 1000000_i128); + } + + #[test] + fn test_encode_decimal256() { + use arrow::array::Decimal256Array; + + let mut log = LogEvent::default(); + // Very large precision number + log.insert("big_value", 123456789.123456_f64); + + let events = vec![Event::Log(log)]; + + // Decimal256(50, 6) - high precision decimal + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "big_value", + DataType::Decimal256(50, 6), + true, + )])); + + let batch = encode_and_decode(events, schema).unwrap(); + 
+ assert_eq!(batch.num_rows(), 1); + + let decimal_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert!(!decimal_array.is_null(0)); + // Value should be non-null and encoded + let value = decimal_array.value(0); + assert!(value.to_i128().is_some()); + } + + #[test] + fn test_encode_decimal_null_values() { + use arrow::array::Decimal128Array; + + let mut log1 = LogEvent::default(); + log1.insert("price", 99.99_f64); + + let log2 = LogEvent::default(); + // No price field - should be null + + let mut log3 = LogEvent::default(); + log3.insert("price", 50.00_f64); + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "price", + DataType::Decimal128(10, 2), + true, + )])); + + let batch = encode_and_decode(events, schema).unwrap(); + + assert_eq!(batch.num_rows(), 3); + + let decimal_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // First row: 99.99 + assert!(!decimal_array.is_null(0)); + assert_eq!(decimal_array.value(0), 9999_i128); + + // Second row: null + assert!(decimal_array.is_null(1)); + + // Third row: 50.00 + assert!(!decimal_array.is_null(2)); + assert_eq!(decimal_array.value(2), 5000_i128); + } +} + +mod primitive_types { + use super::*; + + #[test] + fn test_encode_unsigned_integers_with_null_and_overflow() { + use arrow::array::{UInt8Array, UInt32Array}; + + let mut log1 = LogEvent::default(); + log1.insert("uint8_field", 100_i64); + log1.insert("uint32_field", 1000_i64); + + let mut log2 = LogEvent::default(); + log2.insert("uint8_field", 300_i64); // Overflow - should be null + log2.insert("uint32_field", -1_i64); // Negative - should be null + + let log3 = LogEvent::default(); + // Missing fields - should be null + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = SchemaRef::new(Schema::new(vec![ + Field::new("uint8_field", DataType::UInt8, true), + Field::new("uint32_field", DataType::UInt32, true), + ])); + + let batch = encode_and_decode(events, schema).unwrap(); + + assert_eq!(batch.num_rows(), 3); + + // Check uint8 column + let uint8_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint8_array.value(0), 100_u8); // Valid + assert!(uint8_array.is_null(1)); // Overflow + assert!(uint8_array.is_null(2)); // Missing + + // Check uint32 column + let uint32_array = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint32_array.value(0), 1000_u32); // Valid + assert!(uint32_array.is_null(1)); // Negative + assert!(uint32_array.is_null(2)); // Missing + } + + #[test] + fn test_encode_non_nullable_field_with_null_value() { + let events = vec![ + create_event(vec![("required_field", 42_i64)]), + LogEvent::default().into(), // Missing required field + ]; + + let schema = single_field_schema("required_field", DataType::Int64, false); + let result = encode_events_to_arrow_ipc_stream(&events, Some(schema)); + + assert!(result.is_err()); + match result.unwrap_err() { + ArrowEncodingError::NullConstraint { field_name } => { + assert_eq!(field_name, "required_field"); + } + other => panic!("Expected NullConstraint error, got: {:?}", other), + } + } + + #[test] + fn test_encode_non_nullable_field_all_values_present() { + let events = vec![ + create_event(vec![("id", 1_i64)]), + create_event(vec![("id", 2_i64)]), + create_event(vec![("id", 3_i64)]), + ]; + + let schema = single_field_schema("id", DataType::Int64, false); + let batch = 
encode_and_decode(events, schema).unwrap(); + + assert_eq!(batch.num_rows(), 3); + assert_int64_column(&batch, 0, &[Some(1), Some(2), Some(3)]); + } +} + +mod config_tests { + use super::*; + use tokio_util::codec::Encoder; + + #[test] + fn test_config_allow_nullable_fields_overrides_schema() { + let mut log1 = LogEvent::default(); + log1.insert("strict_field", 42); + let log2 = LogEvent::default(); + let events = vec![Event::Log(log1), Event::Log(log2)]; + + let schema = Schema::new(vec![Field::new("strict_field", DataType::Int64, false)]); + + let mut config = ArrowStreamSerializerConfig::new(schema); + config.allow_nullable_fields = true; + + let mut serializer = + ArrowStreamSerializer::new(config).expect("Failed to create serializer"); + + let mut buffer = BytesMut::new(); + serializer + .encode(events, &mut buffer) + .expect("Encoding should succeed when allow_nullable_fields is true"); + + let cursor = Cursor::new(buffer); + let mut reader = StreamReader::try_new(cursor, None).expect("Failed to create reader"); + let batch = reader.next().unwrap().expect("Failed to read batch"); + + assert_eq!(batch.num_rows(), 2); + + let binding = batch.schema(); + let output_field = binding.field(0); + assert!( + output_field.is_nullable(), + "The output schema field should have been transformed to nullable=true" + ); + + let array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(array.value(0), 42); + assert!(!array.is_null(0)); + assert!( + array.is_null(1), + "The missing value should be encoded as null" + ); + } + + #[test] + fn test_make_field_nullable_with_nested_types() { + let inner_struct_field = Field::new("nested_field", DataType::Int64, false); + let inner_struct = + DataType::Struct(arrow::datatypes::Fields::from(vec![inner_struct_field])); + let list_field = Field::new("item", inner_struct, false); + let list_type = DataType::List(list_field.into()); + let outer_field = Field::new("inner_list", list_type, false); + let outer_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![outer_field])); + + let original_field = Field::new("root", outer_struct, false); + let nullable_field = make_field_nullable(&original_field); + + assert!( + nullable_field.is_nullable(), + "Root field should be nullable" + ); + + if let DataType::Struct(root_fields) = nullable_field.data_type() { + let inner_list_field = &root_fields[0]; + assert!(inner_list_field.is_nullable()); + + if let DataType::List(list_item_field) = inner_list_field.data_type() { + assert!(list_item_field.is_nullable()); + + if let DataType::Struct(inner_struct_fields) = list_item_field.data_type() { + let nested_field = &inner_struct_fields[0]; + assert!(nested_field.is_nullable()); + } else { + panic!("Expected Struct type for list items"); + } + } else { + panic!("Expected List type for inner_list"); + } + } else { + panic!("Expected Struct type for root field"); + } + } + + #[test] + fn test_make_field_nullable_with_map_type() { + let key_field = Field::new("key", DataType::Utf8, false); + let value_field = Field::new("value", DataType::Int64, false); + let entries_struct = + DataType::Struct(arrow::datatypes::Fields::from(vec![key_field, value_field])); + let entries_field = Field::new("entries", entries_struct, false); + let map_type = DataType::Map(entries_field.into(), false); + + let original_field = Field::new("my_map", map_type, false); + let nullable_field = make_field_nullable(&original_field); + + assert!( + nullable_field.is_nullable(), + "Root map field should be nullable" + ); 
+ + if let DataType::Map(entries_field, _sorted) = nullable_field.data_type() { + assert!( + !entries_field.is_nullable(), + "Map entries field should be non-nullable" + ); + + if let DataType::Struct(struct_fields) = entries_field.data_type() { + let key_field = &struct_fields[0]; + let value_field = &struct_fields[1]; + assert!( + !key_field.is_nullable(), + "Map key field should be non-nullable" + ); + assert!( + value_field.is_nullable(), + "Map value field should be nullable" + ); + } else { + panic!("Expected Struct type for map entries"); + } + } else { + panic!("Expected Map type for my_map field"); + } + } +} + +mod nested_types { + use super::*; + + #[test] + fn test_encode_nested_maps() { + use arrow::array::MapArray; + use vrl::value::ObjectMap; + + // Create nested map: Map> + // {"outer_key1": {"inner_key1": 100, "inner_key2": 200}, "outer_key2": {"inner_key3": 300}} + let mut inner_map1 = ObjectMap::new(); + inner_map1.insert("inner_key1".into(), Value::Integer(100)); + inner_map1.insert("inner_key2".into(), Value::Integer(200)); + + let mut inner_map2 = ObjectMap::new(); + inner_map2.insert("inner_key3".into(), Value::Integer(300)); + + let mut outer_map = ObjectMap::new(); + outer_map.insert("outer_key1".into(), Value::Object(inner_map1)); + outer_map.insert("outer_key2".into(), Value::Object(inner_map2)); + + let mut log = LogEvent::default(); + log.insert("nested_map", Value::Object(outer_map)); + + let events = vec![Event::Log(log)]; + + // Define schema: Map> + // Note: MapBuilder uses "keys" and "values" (plural) as field names + let inner_map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + let inner_map_type = DataType::Map(inner_map_entries.into(), false); + + let outer_map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", inner_map_type, true), + ])), + false, + ); + let outer_map_type = DataType::Map(outer_map_entries.into(), false); + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "nested_map", + outer_map_type, + true, + )])); + + let batch = encode_and_decode(events, schema).expect("Failed to encode nested maps"); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the outer map exists + let outer_map_array = batch.column(0).as_any().downcast_ref::().unwrap(); + assert_eq!(outer_map_array.len(), 1); + assert!(!outer_map_array.is_null(0), "Outer map should not be null"); + + // Get the outer map's values (which are inner maps) + let outer_map_value = outer_map_array.value(0); + assert_eq!(outer_map_value.len(), 2, "Outer map should have 2 entries"); + + // The outer map's values are themselves a MapArray + let inner_maps = outer_map_array.values(); + let inner_maps_array = inner_maps.as_any().downcast_ref::().unwrap(); + + // Verify we have 2 inner maps (one for each outer key) + // Total entries across both inner maps: 2 + 1 = 3 + assert_eq!(inner_maps_array.len(), 2, "Should have 2 inner maps"); + + // Verify first inner map has 2 entries + let first_inner_map = inner_maps_array.value(0); + assert_eq!( + first_inner_map.len(), + 2, + "First inner map should have 2 entries" + ); + + // Verify second inner map has 1 entry + let second_inner_map = inner_maps_array.value(1); + assert_eq!( + second_inner_map.len(), + 1, + "Second inner map should have 1 
entry" + ); + } + + #[test] + fn test_encode_array_of_maps() { + use arrow::array::ListArray; + use vrl::value::ObjectMap; + + // Create array of maps: Array> + // [{"key1": 100, "key2": 200}, {"key3": 300}] + let mut map1 = ObjectMap::new(); + map1.insert("key1".into(), Value::Integer(100)); + map1.insert("key2".into(), Value::Integer(200)); + + let mut map2 = ObjectMap::new(); + map2.insert("key3".into(), Value::Integer(300)); + + let array_of_maps = Value::Array(vec![Value::Object(map1), Value::Object(map2)]); + + let mut log = LogEvent::default(); + log.insert("array_of_maps", array_of_maps); + + let events = vec![Event::Log(log)]; + + // Define schema: List> + // Note: MapBuilder uses "keys" and "values" (plural) as field names + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + let map_type = DataType::Map(map_entries.into(), false); + let list_field = Field::new("item", map_type, true); + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "array_of_maps", + DataType::List(list_field.into()), + true, + )])); + + let batch = encode_and_decode(events, schema).expect("Failed to encode array of maps"); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the array exists + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0), "Array should not be null"); + assert_eq!(list_array.value(0).len(), 2, "Array should have 2 maps"); + + // Verify the maps inside the array + let maps = list_array.value(0); + let map_array = maps + .as_any() + .downcast_ref::() + .unwrap(); + + // First map should have 2 entries + let first_map = map_array.value(0); + assert_eq!(first_map.len(), 2, "First map should have 2 entries"); + + // Second map should have 1 entry + let second_map = map_array.value(1); + assert_eq!(second_map.len(), 1, "Second map should have 1 entry"); + } + + #[test] + fn test_encode_array_of_structs() { + use arrow::array::ListArray; + use vrl::value::ObjectMap; + + // Create array of structs (tuples): Array + // [{"f0": "value1", "f1": 100}, {"f0": "value2", "f1": 200}] + let mut tuple1 = ObjectMap::new(); + tuple1.insert("f0".into(), Value::Bytes("value1".into())); + tuple1.insert("f1".into(), Value::Integer(100)); + + let mut tuple2 = ObjectMap::new(); + tuple2.insert("f0".into(), Value::Bytes("value2".into())); + tuple2.insert("f1".into(), Value::Integer(200)); + + let array_of_structs = Value::Array(vec![Value::Object(tuple1), Value::Object(tuple2)]); + + let mut log = LogEvent::default(); + log.insert("array_of_structs", array_of_structs); + + let events = vec![Event::Log(log)]; + + // Define schema: List + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new("f1", DataType::Int32, true), + ]); + let struct_type = DataType::Struct(struct_fields); + let list_field = Field::new("item", struct_type, true); + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "array_of_structs", + DataType::List(list_field.into()), + true, + )])); + + let batch = encode_and_decode(events, schema).expect("Failed to encode array of structs"); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the array exists and has the correct number of elements + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + 
assert!(!list_array.is_null(0), "Array should not be null"); + assert_eq!(list_array.value(0).len(), 2, "Array should have 2 structs"); + + // Verify the structs inside the array + let struct_array = list_array.value(0); + let struct_array = struct_array + .as_any() + .downcast_ref::() + .unwrap(); + + // Check first struct field (f0 - strings) + let f0_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f0_array.value(0), "value1"); + assert_eq!(f0_array.value(1), "value2"); + + // Check second struct field (f1 - integers) + let f1_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f1_array.value(0), 100); + assert_eq!(f1_array.value(1), 200); + } + + #[test] + fn test_encode_deep_nesting() { + use arrow::array::ListArray; + + // Create deeply nested array structure (6 levels): + // Array -> Array -> Array -> Array -> Array -> Int32 + let level_5 = Value::Array(vec![Value::Integer(42), Value::Integer(99)]); + let level_4 = Value::Array(vec![level_5]); + let level_3 = Value::Array(vec![level_4]); + let level_2 = Value::Array(vec![level_3]); + let level_1 = Value::Array(vec![level_2]); + + let mut log = LogEvent::default(); + log.insert("deep_array", level_1); + + let events = vec![Event::Log(log)]; + + // Define schema for deep array nesting (6 levels total) + let mut current_field = Field::new("item", DataType::Int32, true); + for _ in 0..5 { + current_field = Field::new("item", DataType::List(current_field.into()), true); + } + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "deep_array", + current_field.data_type().clone(), + true, + )])); + + let batch = + encode_and_decode(events, schema).expect("Failed to encode deeply nested arrays"); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify deep array by navigating down through all levels + // Store intermediate arrays to avoid lifetime issues + let mut arrays: Vec = Vec::new(); + arrays.push(batch.column(0).clone()); + + // Navigate through 5 nested List levels + for level in 0..5 { + let list_array = arrays[level] + .as_any() + .downcast_ref::() + .unwrap_or_else(|| panic!("Expected ListArray at level {}", level)); + assert!( + !list_array.is_null(0), + "Array should not be null at level {}", + level + ); + assert_eq!( + list_array.len(), + 1, + "Array should have 1 element at level {}", + level + ); + arrays.push(list_array.value(0)); + } + + // Final level (level 5) should be Int32Array with values [42, 99] + let int_array = arrays[5] + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(int_array.len(), 2, "Final array should have 2 elements"); + assert_eq!(int_array.value(0), 42); + assert_eq!(int_array.value(1), 99); + } + + #[test] + fn test_encode_struct_with_list_and_map() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create a struct containing both a list and a map + // Struct { list_field: [1, 2, 3], map_field: {"k1": 10, "k2": 20} } + let mut struct_value = ObjectMap::new(); + struct_value.insert( + "f0".into(), + Value::Array(vec![ + Value::Integer(1), + Value::Integer(2), + Value::Integer(3), + ]), + ); + + let mut map_value = ObjectMap::new(); + map_value.insert("k1".into(), Value::Integer(10)); + map_value.insert("k2".into(), Value::Integer(20)); + struct_value.insert("f1".into(), Value::Object(map_value)); + + let mut log = LogEvent::default(); + log.insert("complex_struct", Value::Object(struct_value)); + + let events = vec![Event::Log(log)]; + + // 
Define schema: Struct { list_field: List, map_field: Map } + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new( + "f0", + DataType::List(Field::new("item", DataType::Int32, true).into()), + true, + ), + Field::new("f1", DataType::Map(map_entries.into(), false), true), + ]); + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "complex_struct", + DataType::Struct(struct_fields), + true, + )])); + + let batch = + encode_and_decode(events, schema).expect("Failed to encode struct with list and map"); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the struct + let struct_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!struct_array.is_null(0)); + + // Verify the list inside the struct (f0) + let list_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 3); + let int_array = list_value + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(int_array.value(0), 1); + assert_eq!(int_array.value(1), 2); + assert_eq!(int_array.value(2), 3); + + // Verify the map inside the struct (f1) + let map_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!map_array.is_null(0)); + let map_value = map_array.value(0); + assert_eq!(map_value.len(), 2); + } + + #[test] + fn test_encode_map_with_struct_values() { + use arrow::array::MapArray; + use vrl::value::ObjectMap; + + // Create a map where values are structs + // Map + // {"item1": {"f0": "Alice", "f1": 10}, "item2": {"f0": "Bob", "f1": 20}} + let mut struct1 = ObjectMap::new(); + struct1.insert("f0".into(), Value::Bytes("Alice".into())); + struct1.insert("f1".into(), Value::Integer(10)); + + let mut struct2 = ObjectMap::new(); + struct2.insert("f0".into(), Value::Bytes("Bob".into())); + struct2.insert("f1".into(), Value::Integer(20)); + + let mut map_value = ObjectMap::new(); + map_value.insert("item1".into(), Value::Object(struct1)); + map_value.insert("item2".into(), Value::Object(struct2)); + + let mut log = LogEvent::default(); + log.insert("map_with_structs", Value::Object(map_value)); + + let events = vec![Event::Log(log)]; + + // Define schema: Map + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new("f1", DataType::Int32, true), + ]); + + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Struct(struct_fields), true), + ])), + false, + ); + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "map_with_structs", + DataType::Map(map_entries.into(), false), + true, + )])); + + let batch = + encode_and_decode(events, schema).expect("Failed to encode map with struct values"); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the map + let map_array = batch.column(0).as_any().downcast_ref::().unwrap(); + assert!(!map_array.is_null(0)); + let map_value = map_array.value(0); + assert_eq!(map_value.len(), 2); + + // Verify the struct values in the map + let struct_array = map_array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + 
assert_eq!(struct_array.len(), 2); + + // Check f0 field (names) + let names_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let name1 = names_array.value(0); + let name2 = names_array.value(1); + assert!(name1 == "Alice" || name1 == "Bob"); + assert!(name2 == "Alice" || name2 == "Bob"); + assert_ne!(name1, name2); + + // Check f1 field (counts) + let counts_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(counts_array.value(0) == 10 || counts_array.value(0) == 20); + assert!(counts_array.value(1) == 10 || counts_array.value(1) == 20); + } + + #[test] + fn test_encode_list_of_structs_containing_maps() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create a list of structs, where each struct contains a map + // List }> + // [ + // {"f0": 1, "f1": {"color": "red", "size": "large"}}, + // {"f0": 2, "f1": {"color": "blue", "size": "small"}} + // ] + let mut attrs1 = ObjectMap::new(); + attrs1.insert("color".into(), Value::Bytes("red".into())); + attrs1.insert("size".into(), Value::Bytes("large".into())); + + let mut struct1 = ObjectMap::new(); + struct1.insert("f0".into(), Value::Integer(1)); + struct1.insert("f1".into(), Value::Object(attrs1)); + + let mut attrs2 = ObjectMap::new(); + attrs2.insert("color".into(), Value::Bytes("blue".into())); + attrs2.insert("size".into(), Value::Bytes("small".into())); + + let mut struct2 = ObjectMap::new(); + struct2.insert("f0".into(), Value::Integer(2)); + struct2.insert("f1".into(), Value::Object(attrs2)); + + let list_value = Value::Array(vec![Value::Object(struct1), Value::Object(struct2)]); + + let mut log = LogEvent::default(); + log.insert("list_of_structs_with_maps", list_value); + + let events = vec![Event::Log(log)]; + + // Define schema + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + ); + + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Int32, true), + Field::new("f1", DataType::Map(map_entries.into(), false), true), + ]); + + let list_field = Field::new("item", DataType::Struct(struct_fields), true); + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "list_of_structs_with_maps", + DataType::List(list_field.into()), + true, + )])); + + let batch = + encode_and_decode(events, schema).expect("Failed to encode list of structs with maps"); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the list + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 2); + + // Verify the structs in the list + let struct_array = list_value + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(struct_array.len(), 2); + + // Verify IDs (f0) + let id_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_array.value(0), 1); + assert_eq!(id_array.value(1), 2); + + // Verify maps (f1) + let map_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(map_array.len(), 2); + assert!(!map_array.is_null(0)); + assert!(!map_array.is_null(1)); + + // Verify first map has 2 entries + let first_map = map_array.value(0); + assert_eq!(first_map.len(), 2); + + // Verify second map has 2 entries + let second_map = 
map_array.value(1); + assert_eq!(second_map.len(), 2); + } + + #[test] + fn test_encode_deeply_nested_mixed_types() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create a very complex nested structure: + // Struct { + // data: List, metadata: Map }>> + // } + let mut metadata = ObjectMap::new(); + metadata.insert("key1".into(), Value::Bytes("value1".into())); + + let mut inner_struct = ObjectMap::new(); + inner_struct.insert("f0".into(), Value::Array(vec![Value::Integer(100)])); + inner_struct.insert("f1".into(), Value::Object(metadata)); + + let mut map_in_list = ObjectMap::new(); + map_in_list.insert("item_key".into(), Value::Object(inner_struct)); + + let mut outer_struct = ObjectMap::new(); + outer_struct.insert("f0".into(), Value::Array(vec![Value::Object(map_in_list)])); + + let mut log = LogEvent::default(); + log.insert("deeply_nested", Value::Object(outer_struct)); + + let events = vec![Event::Log(log)]; + + // Define schema + let metadata_map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + ); + + let inner_struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new( + "f0", + DataType::List(Field::new("item", DataType::Int32, true).into()), + true, + ), + Field::new( + "f1", + DataType::Map(metadata_map_entries.into(), false), + true, + ), + ]); + + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Struct(inner_struct_fields), true), + ])), + false, + ); + + let list_field = Field::new("item", DataType::Map(map_entries.into(), false), true); + + let outer_struct_fields = arrow::datatypes::Fields::from(vec![Field::new( + "f0", + DataType::List(list_field.into()), + true, + )]); + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "deeply_nested", + DataType::Struct(outer_struct_fields), + true, + )])); + + let batch = + encode_and_decode(events, schema).expect("Failed to encode deeply nested mixed types"); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the outer struct + let outer_struct = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!outer_struct.is_null(0)); + + // Verify the list inside the outer struct + let list_array = outer_struct + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 1); + + // Verify the map inside the list + let map_array = list_value.as_any().downcast_ref::().unwrap(); + assert_eq!(map_array.len(), 1); + assert!(!map_array.is_null(0)); + + // Verify the struct inside the map + let struct_values = map_array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(struct_values.len(), 1); + + // Verify the list inside the struct + let inner_list = struct_values + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!inner_list.is_null(0)); + let inner_list_value = inner_list.value(0); + assert_eq!(inner_list_value.len(), 1); + + // Verify the innermost map + let inner_map = struct_values + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!inner_map.is_null(0)); + let inner_map_value = inner_map.value(0); + assert_eq!(inner_map_value.len(), 1); + } + + #[test] + fn 
test_automatic_json_serialization_for_array_of_objects() { + use vrl::value::ObjectMap; + + // Create array of objects (like the user's components data) + let mut obj1 = ObjectMap::new(); + obj1.insert("name".into(), Value::Bytes("service.api.v1".into())); + obj1.insert("alias".into(), Value::Bytes("widget-alpha".into())); + obj1.insert("timeout".into(), Value::Integer(60000)); + + let mut obj2 = ObjectMap::new(); + obj2.insert("name".into(), Value::Bytes("service.backend".into())); + obj2.insert("alias".into(), Value::Bytes("widget-beta".into())); + obj2.insert("timeout".into(), Value::Integer(30000)); + + let components = Value::Array(vec![Value::Object(obj1), Value::Object(obj2)]); + + let mut log = LogEvent::default(); + log.insert("components", components); + + let events = vec![Event::Log(log)]; + + // Schema expects Array(String), but we're providing Array(Object) + // The encoder should automatically serialize objects to JSON strings + let schema = Schema::new(vec![Field::new( + "components", + DataType::List(Field::new("item", DataType::Utf8, true).into()), + false, + )]); + + let batch = encode_and_decode(events, Arc::new(schema)) + .expect("Encoding should succeed with automatic JSON serialization"); + + assert_eq!(batch.num_rows(), 1); + + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + + let list_value = list_array.value(0); + let string_array = list_value.as_any().downcast_ref::().unwrap(); + + // Should have 2 strings (JSON serialized objects) + assert_eq!(string_array.len(), 2); + + // Verify the first object was serialized to JSON + let json1 = string_array.value(0); + assert!(json1.contains("\"name\":\"service.api.v1\"")); + assert!(json1.contains("\"alias\":\"widget-alpha\"")); + assert!(json1.contains("\"timeout\":60000")); + + // Verify the second object was serialized to JSON + let json2 = string_array.value(1); + assert!(json2.contains("\"name\":\"service.backend\"")); + assert!(json2.contains("\"alias\":\"widget-beta\"")); + assert!(json2.contains("\"timeout\":30000")); + } + + #[test] + fn test_object_in_map_values_to_string() { + use vrl::value::ObjectMap; + + // Create a map with object values: Map + // Schema expects Map, so objects should serialize to JSON + let mut inner_obj = ObjectMap::new(); + inner_obj.insert("config".into(), Value::Bytes("enabled".into())); + inner_obj.insert("timeout".into(), Value::Integer(5000)); + + let mut map_value = ObjectMap::new(); + map_value.insert("setting1".into(), Value::Object(inner_obj)); + map_value.insert("setting2".into(), Value::Bytes("simple string".into())); + + let mut log = LogEvent::default(); + log.insert("settings", Value::Object(map_value)); + + let events = vec![Event::Log(log)]; + + // Schema: Map (expects string values, but we have objects) + let key_field = Field::new("keys", DataType::Utf8, false); + let value_field = Field::new("values", DataType::Utf8, true); + let entries_struct = DataType::Struct(Fields::from(vec![key_field, value_field])); + let entries_field = Field::new("entries", entries_struct, false); + let map_type = DataType::Map(entries_field.into(), false); + + let schema = Schema::new(vec![Field::new("settings", map_type, false)]); + + let batch = encode_and_decode(events, Arc::new(schema)) + .expect("Map with object values should serialize to JSON strings"); + + assert_eq!(batch.num_rows(), 1); + + let map_array = batch.column(0).as_any().downcast_ref::().unwrap(); + assert!(!map_array.is_null(0)); + + // Get the values 
from the map + let values_array = map_array + .values() + .as_any() + .downcast_ref::<StringArray>() + .unwrap(); + + // One value should be a JSON object, one should be a plain string + let mut found_json_object = false; + let mut found_plain_string = false; + + for i in 0..values_array.len() { + let value = values_array.value(i); + if value.contains("\"config\"") && value.contains("\"timeout\"") { + found_json_object = true; + } else if value == "simple string" { + found_plain_string = true; + } + } + + assert!( + found_json_object, + "Should find JSON-serialized object in map values" + ); + assert!(found_plain_string, "Should find plain string in map values"); + } + + #[test] + fn test_nested_arrays_with_objects() { + use vrl::value::ObjectMap; + + // Array of arrays, where inner arrays contain objects + let mut obj = ObjectMap::new(); + obj.insert("id".into(), Value::Integer(123)); + + let inner_array = Value::Array(vec![Value::Object(obj.clone())]); + let outer_array = Value::Array(vec![inner_array]); + + let mut log = LogEvent::default(); + log.insert("nested", outer_array); + + let events = vec![Event::Log(log)]; + + // Schema: Array(Array(String)) + let inner_field = Field::new("item", DataType::Utf8, true); + let middle_field = Field::new("item", DataType::List(inner_field.into()), true); + let outer_list = DataType::List(middle_field.into()); + + let schema = Schema::new(vec![Field::new("nested", outer_list, false)]); + + let batch = encode_and_decode(events, Arc::new(schema)) + .expect("Nested arrays with objects should serialize"); + + assert_eq!(batch.num_rows(), 1); + + // Navigate to the deepest array + let outer_list = batch + .column(0) + .as_any() + .downcast_ref::<ListArray>() + .unwrap(); + let outer_value = outer_list.value(0); + let middle_list = outer_value.as_any().downcast_ref::<ListArray>().unwrap(); + let middle_value = middle_list.value(0); + let inner_strings = middle_value.as_any().downcast_ref::<StringArray>().unwrap(); + + // Should have one JSON string + assert_eq!(inner_strings.len(), 1); + let json_str = inner_strings.value(0); + assert!( + json_str.contains("\"id\":123"), + "Deeply nested object should be serialized to JSON" + ); + } +} diff --git a/lib/codecs/src/encoding/format/arrow/types.rs b/lib/codecs/src/encoding/format/arrow/types.rs new file mode 100644 index 0000000000000..8bc82d9c64e16 --- /dev/null +++ b/lib/codecs/src/encoding/format/arrow/types.rs @@ -0,0 +1,70 @@ +//! Arrow type to array builder mapping +//! +//! Creates appropriate Arrow array builders for different data types, +//! with special handling for complex nested types (List, Struct, Map). + +use arrow::array::{ + ArrayBuilder, ListBuilder, MapBuilder, StringBuilder, StructBuilder, make_builder, +}; +use arrow::datatypes::DataType; + +use super::ArrowEncodingError; + +const NESTED_CAPACITY_MULTIPLIER: usize = 4; + +/// Creates an array builder for a given Arrow data type. +/// +/// Uses Arrow's `make_builder` for most types, but provides custom handling +/// for complex nested types (List, Struct, Map) to ensure proper recursive +/// builder creation, especially for nested Maps which `make_builder` doesn't +/// fully support.
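+///
+/// A minimal usage sketch (illustrative only, not part of the original
+/// source; the capacity of 1024 is an arbitrary choice):
+///
+/// ```ignore
+/// use arrow::datatypes::{DataType, Field};
+///
+/// // A List<Int32> column gets a recursively constructed ListBuilder.
+/// let list_type = DataType::List(Field::new("item", DataType::Int32, true).into());
+/// let builder = create_array_builder_for_type(&list_type, 1024).expect("supported type");
+/// assert_eq!(builder.len(), 0);
+/// ```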
+pub(crate) fn create_array_builder_for_type( + data_type: &DataType, + capacity: usize, +) -> Result<Box<dyn ArrayBuilder>, ArrowEncodingError> { + match data_type { + DataType::List(inner_field) => create_list_builder(inner_field.data_type(), capacity), + DataType::Struct(fields) => create_struct_builder(fields, capacity), + DataType::Map(entries_field, _) => create_map_builder(entries_field.data_type(), capacity), + _ => Ok(make_builder(data_type, capacity)), + } +} + +fn create_list_builder( + inner_type: &DataType, + capacity: usize, +) -> Result<Box<dyn ArrayBuilder>, ArrowEncodingError> { + let nested_capacity = capacity * NESTED_CAPACITY_MULTIPLIER; + let inner_builder = create_array_builder_for_type(inner_type, nested_capacity)?; + Ok(Box::new(ListBuilder::new(inner_builder))) +} + +fn create_struct_builder( + fields: &arrow::datatypes::Fields, + capacity: usize, +) -> Result<Box<dyn ArrayBuilder>, ArrowEncodingError> { + let field_builders = fields + .iter() + .map(|f| create_array_builder_for_type(f.data_type(), capacity)) + .collect::<Result<Vec<_>, _>>()?; + Ok(Box::new(StructBuilder::new(fields.clone(), field_builders))) +} + +fn create_map_builder( + entries_type: &DataType, + capacity: usize, +) -> Result<Box<dyn ArrayBuilder>, ArrowEncodingError> { + let DataType::Struct(entries_fields) = entries_type else { + return Err(ArrowEncodingError::UnsupportedType { + field_name: "dynamic".into(), + data_type: entries_type.clone(), + }); + }; + + let nested_capacity = capacity * NESTED_CAPACITY_MULTIPLIER; + let key_builder = StringBuilder::with_capacity(nested_capacity, 0); + let value_builder = + create_array_builder_for_type(entries_fields[1].data_type(), nested_capacity)?; + + Ok(Box::new(MapBuilder::new(None, key_builder, value_builder))) +} diff --git a/src/sinks/clickhouse/arrow/parser.rs b/src/sinks/clickhouse/arrow/parser.rs index a13bd823487b5..3b79994ea4968 100644 --- a/src/sinks/clickhouse/arrow/parser.rs +++ b/src/sinks/clickhouse/arrow/parser.rs @@ -1,6 +1,6 @@ //! ClickHouse type parsing and conversion to Arrow types. -use arrow::datatypes::{DataType, TimeUnit}; +use arrow::datatypes::{DataType, Field, Fields, TimeUnit}; const DECIMAL32_PRECISION: u8 = 9; const DECIMAL64_PRECISION: u8 = 18; @@ -16,6 +16,12 @@ pub enum ClickHouseType<'a> { Nullable(Box<ClickHouseType<'a>>), /// LowCardinality(T) LowCardinality(Box<ClickHouseType<'a>>), + /// Array(T) + Array(Box<ClickHouseType<'a>>), + /// Tuple(T1, T2, ...) or Tuple(name1 T1, name2 T2, ...) + Tuple(Vec<(Option<&'a str>, ClickHouseType<'a>)>), + /// Map(K, V) + Map(Box<ClickHouseType<'a>>, Box<ClickHouseType<'a>>), } impl<'a> ClickHouseType<'a> { @@ -38,99 +44,207 @@ impl<'a> ClickHouseType<'a> { _ => self, } } + + /// Converts this structured ClickHouseType to an Arrow DataType. + /// Returns a tuple of (DataType, is_nullable). + pub fn to_arrow(&self) -> Result<(DataType, bool), String> { + let is_nullable = self.is_nullable(); + + let data_type = match self.base_type() { + ClickHouseType::Primitive(name) => { + let (type_name, _) = extract_identifier(name); + match type_name { + // Numeric + "Int8" => DataType::Int8, + "Int16" => DataType::Int16, + "Int32" => DataType::Int32, + "Int64" => DataType::Int64, + "UInt8" => DataType::UInt8, + "UInt16" => DataType::UInt16, + "UInt32" => DataType::UInt32, + "UInt64" => DataType::UInt64, + "Float32" => DataType::Float32, + "Float64" => DataType::Float64, + "Bool" => DataType::Boolean, + "Decimal" | "Decimal32" | "Decimal64" | "Decimal128" | "Decimal256" => { + parse_decimal_type(name)?
+ } + + // Strings + "String" | "FixedString" => DataType::Utf8, + + // Date and time + "Date" | "Date32" => DataType::Date32, + "DateTime" => DataType::Timestamp(TimeUnit::Second, None), + "DateTime64" => parse_datetime64_precision(name)?, + + _ => return Err(format!("Unknown ClickHouse type '{}'", type_name)), + } + } + ClickHouseType::Array(inner) => { + let (inner_arrow, inner_nullable) = inner.to_arrow()?; + DataType::List(Field::new("item", inner_arrow, inner_nullable).into()) + } + ClickHouseType::Tuple(elements) => { + let fields: Vec<Field> = elements + .iter() + .enumerate() + .map(|(i, (name_opt, elem))| { + let (dt, nullable) = elem.to_arrow()?; + + let name = name_opt + .as_deref() + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .unwrap_or_else(|| format!("f{}", i)); + + Ok(Field::new(name, dt, nullable)) + }) + .collect::<Result<_, String>>()?; + + DataType::Struct(Fields::from(fields)) + } + ClickHouseType::Map(key_type, value_type) => { + let (key_arrow, _) = key_type.to_arrow()?; + + if !matches!(key_arrow, DataType::Utf8) { + return Err("Map keys must be String type.".to_string()); + } + + let (value_arrow, value_nullable) = value_type.to_arrow()?; + + let entries = DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", value_arrow, value_nullable), + ])); + + DataType::Map(Field::new("entries", entries, false).into(), false) + } + _ => return Err("Unsupported ClickHouse type".to_string()), + }; + + Ok((data_type, is_nullable)) + } } /// Parses a ClickHouse type string into a structured representation. pub fn parse_ch_type(ty: &str) -> ClickHouseType<'_> { let ty = ty.trim(); - // Recursively strip and parse type modifiers - if let Some(inner) = strip_wrapper(ty, "Nullable") { - return ClickHouseType::Nullable(Box::new(parse_ch_type(inner))); - } - if let Some(inner) = strip_wrapper(ty, "LowCardinality") { - return ClickHouseType::LowCardinality(Box::new(parse_ch_type(inner))); + // Try to match type_name(args) pattern + if let Some((type_name, args_str)) = try_parse_wrapper(ty) { + match type_name { + "Nullable" => { + return ClickHouseType::Nullable(Box::new(parse_ch_type(args_str))); + } + "LowCardinality" => { + return ClickHouseType::LowCardinality(Box::new(parse_ch_type(args_str))); + } + "Array" => { + return ClickHouseType::Array(Box::new(parse_ch_type(args_str))); + } + "Tuple" => { + let elements = parse_args(args_str) + .into_iter() + .map(|arg| parse_tuple_element(arg)) + .collect(); + return ClickHouseType::Tuple(elements); + } + "Map" => { + let args = parse_args(args_str); + if args.len() == 2 { + return ClickHouseType::Map( + Box::new(parse_ch_type(args[0])), + Box::new(parse_ch_type(args[1])), + ); + } + } + _ => {} // Fall through to primitive + } } - // Base case: return primitive type for anything without modifiers + // Base case: return primitive type ClickHouseType::Primitive(ty) } -/// Helper function to strip a wrapper from a type string. -/// Returns the inner content if the type matches the wrapper pattern. -fn strip_wrapper<'a>(ty: &'a str, wrapper_name: &str) -> Option<&'a str> { - ty.strip_prefix(wrapper_name)? - .trim_start() - .strip_prefix('(')? - .strip_suffix(')') -} - -/// Unwraps ClickHouse type modifiers like Nullable() and LowCardinality(). -/// Returns a tuple of (base_type, is_nullable).
-/// For example: "LowCardinality(Nullable(String))" -> ("String", true) -pub fn unwrap_type_modifiers(ch_type: &str) -> (&str, bool) { - let parsed = parse_ch_type(ch_type); - let is_nullable = parsed.is_nullable(); +/// Helper: Finds the index of a delimiter, respecting nested parentheses/quotes. +fn find_delimiter(input: &str, delimiter: char) -> Option<usize> { + let mut depth = 0; + let mut in_quotes = false; - match parsed.base_type() { - ClickHouseType::Primitive(base) => (base, is_nullable), - _ => (ch_type, is_nullable), + for (i, c) in input.char_indices() { + match c { + '\'' => in_quotes = !in_quotes, + '(' if !in_quotes => depth += 1, + ')' if !in_quotes => depth -= 1, + c if c == delimiter && depth == 0 && !in_quotes => return Some(i), + _ => {} + } } + None } -fn unsupported(ch_type: &str, kind: &str) -> String { - format!( - "{kind} type '{ch_type}' is not supported. \ - ClickHouse {kind} types cannot be automatically converted to Arrow format." - ) +/// Parses a Tuple element which can be: +/// - Just a type: "String" -> (None, ClickHouseType::Primitive("String")) +/// - Named field: "category String" -> (Some("category"), ClickHouseType::Primitive("String")) +fn parse_tuple_element(element: &str) -> (Option<&str>, ClickHouseType<'_>) { + let element = element.trim(); + + // Use the helper to find the first space + if let Some(pos) = find_delimiter(element, ' ') { + let name = element[..pos].trim(); + let type_str = element[pos + 1..].trim(); + if !name.is_empty() && !type_str.is_empty() { + return (Some(name), parse_ch_type(type_str)); + } + } + + // No named field found, treat entire element as a type + (None, parse_ch_type(element)) } -/// Converts a ClickHouse type string to an Arrow DataType. -/// Returns a tuple of (DataType, is_nullable). -pub fn clickhouse_type_to_arrow(ch_type: &str) -> Result<(DataType, bool), String> { - let (base_type, is_nullable) = unwrap_type_modifiers(ch_type); - let (type_name, _) = extract_identifier(base_type); - - let data_type = match type_name { - // Numeric - "Int8" => DataType::Int8, - "Int16" => DataType::Int16, - "Int32" => DataType::Int32, - "Int64" => DataType::Int64, - "UInt8" => DataType::UInt8, - "UInt16" => DataType::UInt16, - "UInt32" => DataType::UInt32, - "UInt64" => DataType::UInt64, - "Float32" => DataType::Float32, - "Float64" => DataType::Float64, - "Bool" => DataType::Boolean, - "Decimal" | "Decimal32" | "Decimal64" | "Decimal128" | "Decimal256" => { - parse_decimal_type(base_type)? - } +/// Tries to parse "TypeName(args)" into ("TypeName", "args"). +fn try_parse_wrapper(ty: &str) -> Option<(&str, &str)> { + let paren_pos = ty.find('(')?; + if !ty.ends_with(')') { + return None; + } - // Strings - "String" | "FixedString" => DataType::Utf8, - - // Date and time types (timezones not currently handled, defaults to UTC) - "Date" | "Date32" => DataType::Date32, - "DateTime" => DataType::Timestamp(TimeUnit::Second, None), - "DateTime64" => parse_datetime64_precision(base_type)?, - - // Unsupported - "Array" => return Err(unsupported(ch_type, "Array")), - "Tuple" => return Err(unsupported(ch_type, "Tuple")), - "Map" => return Err(unsupported(ch_type, "Map")), - - // Unknown - _ => { - return Err(format!( - "Unknown ClickHouse type '{}'. This type cannot be automatically converted.", - type_name - )); - } + let type_name = ty[..paren_pos].trim(); + let args = &ty[paren_pos + 1..ty.len() - 1]; + + Some((type_name, args)) +} + +/// Parses comma-separated arguments, respecting nesting and quotes.
+/// Handles input with or without surrounding parentheses. +/// Examples: "Int32, String" or "(Int32, String)" both work. +fn parse_args(input: &str) -> Vec<&str> { + let input = input.trim(); + + // Strip outer parens + let input = if input.starts_with('(') && input.ends_with(')') { + &input[1..input.len() - 1] + } else { + input }; - Ok((data_type, is_nullable)) + if input.is_empty() { + return vec![]; + } + + let mut args = Vec::new(); + let mut current = input; + + // Use the same helper to loop through commas + while let Some(pos) = find_delimiter(current, ',') { + args.push(current[..pos].trim()); + current = &current[pos + 1..]; + } + // Push the remainder + args.push(current.trim()); + + args } /// Extracts an identifier from the start of a string. @@ -145,52 +259,6 @@ fn extract_identifier(input: &str) -> (&str, &str) { (input, "") } -/// Parses comma-separated arguments from a parenthesized string. -/// Input: "(arg1, arg2, arg3)" -> Output: Ok(vec!["arg1".to_string(), "arg2".to_string(), "arg3".to_string()]) -/// Returns an error if parentheses are malformed. -fn parse_args(input: &str) -> Result<Vec<String>, String> { - let trimmed = input.trim(); - if !trimmed.starts_with('(') || !trimmed.ends_with(')') { - return Err(format!( - "Expected parentheses around arguments in '{}'", - input - )); - } - - let inner = trimmed[1..trimmed.len() - 1].trim(); - if inner.is_empty() { - return Ok(vec![]); - } - - // Split by comma, handling nested parentheses and quotes - let mut args = Vec::new(); - let mut current_arg = String::new(); - let mut depth = 0; - let mut in_quotes = false; - - for c in inner.chars() { - match c { - '\'' if !in_quotes => in_quotes = true, - '\'' if in_quotes => in_quotes = false, - '(' if !in_quotes => depth += 1, - ')' if !in_quotes => depth -= 1, - ',' if depth == 0 && !in_quotes => { - args.push(current_arg.trim().to_string()); - current_arg = String::new(); - continue; - } - _ => {} - } - current_arg.push(c); - } - - if !current_arg.trim().is_empty() { - args.push(current_arg.trim().to_string()); - } - - Ok(args) -} - /// Parses ClickHouse Decimal types and returns the appropriate Arrow decimal type. /// ClickHouse formats: /// - Decimal(P, S) -> generic decimal with precision P and scale S @@ -204,7 +272,8 @@ fn parse_decimal_type(ch_type: &str) -> Result<DataType, String> { // Parse from type string let (type_name, args_str) = extract_identifier(ch_type); - let result = parse_args(args_str).ok().and_then(|args| match type_name { + let args = parse_args(args_str); + let result = match type_name { "Decimal" if args.len() == 2 => args[0].parse::<u8>().ok().zip(args[1].parse::<i8>().ok()), "Decimal32" | "Decimal64" | "Decimal128" | "Decimal256" if args.len() == 1 => { args[0].parse::<i8>().ok().map(|scale| { @@ -219,7 +288,7 @@ fn parse_decimal_type(ch_type: &str) -> Result<DataType, String> { }) } _ => None, - }); + }; result .map(|(precision, scale)| { @@ -242,12 +311,7 @@ fn parse_datetime64_precision(ch_type: &str) -> Result<DataType, String> { // Parse from type string let (_type_name, args_str) = extract_identifier(ch_type); - let args = parse_args(args_str).map_err(|e| { - format!( - "Could not parse DateTime64 arguments from '{}': {}.
Expected format: DateTime64(0-9) or DateTime64(0-9, 'timezone')", - ch_type, e - ) - })?; + let args = parse_args(args_str); // DateTime64(precision) or DateTime64(precision, 'timezone') if args.is_empty() { @@ -276,7 +340,7 @@ mod tests { // Helper function for tests that don't need metadata fn convert_type_no_metadata(ch_type: &str) -> Result<(DataType, bool), String> { - clickhouse_type_to_arrow(ch_type) + parse_ch_type(ch_type).to_arrow() } #[test] @@ -502,82 +566,87 @@ mod tests { #[test] fn test_parse_args() { - // Simple cases - assert_eq!( - parse_args("(10, 2)").unwrap(), - vec!["10".to_string(), "2".to_string()] - ); - assert_eq!(parse_args("(3)").unwrap(), vec!["3".to_string()]); - assert_eq!(parse_args("()").unwrap(), Vec::<String>::new()); + // Simple cases with parentheses + assert_eq!(parse_args("(10, 2)"), vec!["10", "2"]); + assert_eq!(parse_args("(3)"), vec!["3"]); + assert_eq!(parse_args("()"), Vec::<&str>::new()); + + // Simple cases without parentheses (now supported) + assert_eq!(parse_args("10, 2"), vec!["10", "2"]); + assert_eq!(parse_args("3"), vec!["3"]); // With spaces - assert_eq!( - parse_args("( 10 , 2 )").unwrap(), - vec!["10".to_string(), "2".to_string()] - ); + assert_eq!(parse_args("( 10 , 2 )"), vec!["10", "2"]); // With nested parentheses + assert_eq!(parse_args("(Nullable(String))"), vec!["Nullable(String)"]); assert_eq!( - parse_args("(Nullable(String))").unwrap(), - vec!["Nullable(String)".to_string()] - ); - assert_eq!( - parse_args("(Array(Int32), String)").unwrap(), - vec!["Array(Int32)".to_string(), "String".to_string()] + parse_args("(Array(Int32), String)"), + vec!["Array(Int32)", "String"] ); // With quotes + assert_eq!(parse_args("(3, 'UTC')"), vec!["3", "'UTC'"]); assert_eq!( - parse_args("(3, 'UTC')").unwrap(), - vec!["3".to_string(), "'UTC'".to_string()] - ); - assert_eq!( - parse_args("(9, 'America/New_York')").unwrap(), - vec!["9".to_string(), "'America/New_York'".to_string()] + parse_args("(9, 'America/New_York')"), + vec!["9", "'America/New_York'"] ); - // Complex nested case + // Complex nested case with multiple levels, modifiers, named tuples, and quotes assert_eq!( - parse_args("(Tuple(Int32, String), Array(Float64))").unwrap(), + parse_args( + "(Array(Tuple(id Int64, tags Array(String))), Map(String, Tuple(Nullable(Float64), LowCardinality(String))), String, DateTime('America/New_York'))" + ), vec![ - "Tuple(Int32, String)".to_string(), - "Array(Float64)".to_string() + "Array(Tuple(id Int64, tags Array(String)))", + "Map(String, Tuple(Nullable(Float64), LowCardinality(String)))", + "String", + "DateTime('America/New_York')" ] ); - - // Error cases - assert!(parse_args("10, 2").is_err()); // Missing parentheses - assert!(parse_args("(10, 2").is_err()); // Missing closing paren } #[test] - fn test_array_type_not_supported() { - // Array types should return an error + fn test_array_type() { let result = convert_type_no_metadata("Array(Int32)"); - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(err.contains("Array type")); - assert!(err.contains("not supported")); + assert!(result.is_ok()); + let (data_type, is_nullable) = result.unwrap(); + assert!(!is_nullable); + match data_type { + DataType::List(field) => { + assert_eq!(field.data_type(), &DataType::Int32); + assert!(!field.is_nullable()); + } + _ => panic!("Expected List type"), + } } #[test] - fn test_tuple_type_not_supported() { - // Tuple types should return an error + fn test_tuple_type() { let result = convert_type_no_metadata("Tuple(String, Int64)"); -
assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(err.contains("Tuple type")); - assert!(err.contains("not supported")); + assert!(result.is_ok()); + let (data_type, is_nullable) = result.unwrap(); + assert!(!is_nullable); + match data_type { + DataType::Struct(fields) => { + assert_eq!(fields.len(), 2); + assert_eq!(fields[0].data_type(), &DataType::Utf8); + assert_eq!(fields[1].data_type(), &DataType::Int64); + } + _ => panic!("Expected Struct type"), + } } #[test] - fn test_map_type_not_supported() { - // Map types should return an error + fn test_map_type() { let result = convert_type_no_metadata("Map(String, Int64)"); - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(err.contains("Map type")); - assert!(err.contains("not supported")); + assert!(result.is_ok()); + let (data_type, is_nullable) = result.unwrap(); + assert!(!is_nullable); + match data_type { + DataType::Map(_, _) => {} + _ => panic!("Expected Map type"), + } } #[test] @@ -644,4 +713,184 @@ mod tests { let parsed = parse_ch_type("String"); assert_eq!(parsed.base_type(), &ClickHouseType::Primitive("String")); } + + #[test] + fn test_array_type_parsing() { + // Simple array + let result = convert_type_no_metadata("Array(Int32)"); + assert!(result.is_ok()); + let (dtype, nullable) = result.unwrap(); + assert!(matches!(dtype, DataType::List(_))); + assert!(!nullable); + + // Nested array + let result = convert_type_no_metadata("Array(Array(String))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::List(inner) = dtype { + assert!(matches!(inner.data_type(), DataType::List(_))); + } else { + panic!("Expected List type"); + } + + // Nullable array + let result = convert_type_no_metadata("Nullable(Array(Int64))"); + assert!(result.is_ok()); + let (_, nullable) = result.unwrap(); + assert!(nullable); + } + + #[test] + fn test_tuple_type_parsing() { + // Simple tuple + let result = convert_type_no_metadata("Tuple(String, Int64)"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Struct(fields) = dtype { + assert_eq!(fields.len(), 2); + assert_eq!(fields[0].name(), "f0"); + assert_eq!(fields[1].name(), "f1"); + assert!(matches!(fields[0].data_type(), DataType::Utf8)); + assert!(matches!(fields[1].data_type(), DataType::Int64)); + } else { + panic!("Expected Struct type"); + } + + // Nested tuple + let result = convert_type_no_metadata("Tuple(Int32, Tuple(String, Float64))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Struct(fields) = dtype { + assert_eq!(fields.len(), 2); + assert!(matches!(fields[1].data_type(), DataType::Struct(_))); + } else { + panic!("Expected Struct type"); + } + } + + #[test] + fn test_map_type_parsing() { + // Simple map + let result = convert_type_no_metadata("Map(String, Int64)"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + assert!(matches!(dtype, DataType::Map(_, _))); + + // Map with complex value + let result = convert_type_no_metadata("Map(String, Array(Int32))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Map(entries, _) = dtype + && let DataType::Struct(fields) = entries.data_type() + { + let value_field = &fields[1]; + assert!(matches!(value_field.data_type(), DataType::List(_))); + } + + // Non-string key should error + let result = convert_type_no_metadata("Map(Int32, String)"); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(err.contains("Map keys must be String")); + } + 
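+    // Illustrative sketch, not part of the original change set: where the
+    // `Nullable` modifier sits decides whether the column itself or the
+    // list items are nullable after conversion.
+    #[test]
+    fn test_array_nullability_sketch() {
+        // Nullable outside Array: nullable column, non-nullable items.
+        let (dtype, nullable) = convert_type_no_metadata("Nullable(Array(String))").unwrap();
+        assert!(nullable);
+        if let DataType::List(item) = dtype {
+            assert!(!item.is_nullable());
+            assert_eq!(item.data_type(), &DataType::Utf8);
+        } else {
+            panic!("Expected List type");
+        }
+
+        // Nullable inside Array: non-nullable column, nullable items.
+        let (dtype, nullable) = convert_type_no_metadata("Array(Nullable(Int32))").unwrap();
+        assert!(!nullable);
+        if let DataType::List(item) = dtype {
+            assert!(item.is_nullable());
+            assert_eq!(item.data_type(), &DataType::Int32);
+        } else {
+            panic!("Expected List type");
+        }
+    }
+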
+ #[test] + fn test_complex_nested_types() { + // Array of tuples + let result = convert_type_no_metadata("Array(Tuple(String, Int64))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::List(inner) = dtype { + assert!(matches!(inner.data_type(), DataType::Struct(_))); + } else { + panic!("Expected List type"); + } + + // Tuple with array and map + let result = convert_type_no_metadata("Tuple(Array(Int32), Map(String, Float64))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Struct(fields) = dtype { + assert_eq!(fields.len(), 2); + assert!(matches!(fields[0].data_type(), DataType::List(_))); + assert!(matches!(fields[1].data_type(), DataType::Map(_, _))); + } else { + panic!("Expected Struct type"); + } + + // Map with tuple values + let result = convert_type_no_metadata("Map(String, Tuple(Int64, String))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Map(entries, _) = dtype + && let DataType::Struct(fields) = entries.data_type() + { + let value_field = &fields[1]; + assert!(matches!(value_field.data_type(), DataType::Struct(_))); + } + } + + #[test] + fn test_named_tuple_fields() { + // Simple named tuple + let result = convert_type_no_metadata("Tuple(category String, tag String)"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Struct(fields) = dtype { + assert_eq!(fields.len(), 2); + assert_eq!(fields[0].name(), "category"); + assert_eq!(fields[1].name(), "tag"); + assert!(matches!(fields[0].data_type(), DataType::Utf8)); + assert!(matches!(fields[1].data_type(), DataType::Utf8)); + } else { + panic!("Expected Struct type"); + } + + // Array of named tuples (the original failing case) + let result = convert_type_no_metadata("Array(Tuple(category String, tag String))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::List(inner) = dtype { + if let DataType::Struct(fields) = inner.data_type() { + assert_eq!(fields.len(), 2); + assert_eq!(fields[0].name(), "category"); + assert_eq!(fields[1].name(), "tag"); + assert!(matches!(fields[0].data_type(), DataType::Utf8)); + assert!(matches!(fields[1].data_type(), DataType::Utf8)); + } else { + panic!("Expected Struct type inside List"); + } + } else { + panic!("Expected List type"); + } + + // Fully named tuple (explicit names replace the positional f0/f1 defaults) + let result = convert_type_no_metadata("Tuple(id Int64, data String)"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Struct(fields) = dtype { + assert_eq!(fields.len(), 2); + assert_eq!(fields[0].name(), "id"); + assert_eq!(fields[1].name(), "data"); + assert!(matches!(fields[0].data_type(), DataType::Int64)); + assert!(matches!(fields[1].data_type(), DataType::Utf8)); + } else { + panic!("Expected Struct type"); + } + + // Named tuple with complex types + let result = + convert_type_no_metadata("Tuple(items Array(Int32), metadata Map(String, String))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Struct(fields) = dtype { + assert_eq!(fields.len(), 2); + assert_eq!(fields[0].name(), "items"); + assert_eq!(fields[1].name(), "metadata"); + assert!(matches!(fields[0].data_type(), DataType::List(_))); + assert!(matches!(fields[1].data_type(), DataType::Map(_, _))); + } else { + panic!("Expected Struct type"); + } + } } diff --git a/src/sinks/clickhouse/arrow/schema.rs b/src/sinks/clickhouse/arrow/schema.rs index f2359ca5f3519..fd9842b1bb371 100644 ---
a/src/sinks/clickhouse/arrow/schema.rs +++ b/src/sinks/clickhouse/arrow/schema.rs @@ -9,7 +9,7 @@ use vector_lib::codecs::encoding::format::{ArrowEncodingError, SchemaProvider}; use crate::http::{Auth, HttpClient}; -use super::parser::clickhouse_type_to_arrow; +use super::parser::parse_ch_type; #[derive(Debug, Deserialize)] struct ColumnInfo { @@ -87,7 +87,8 @@ fn parse_schema_from_response(response: &str) -> crate::Result<Schema> { let mut fields = Vec::new(); for column in columns { - let (arrow_type, nullable) = clickhouse_type_to_arrow(&column.column_type) + let (arrow_type, nullable) = parse_ch_type(&column.column_type) + .to_arrow() + .map_err(|e| format!("Failed to convert column '{}': {}", column.name, e))?; fields.push(Field::new(&column.name, arrow_type, nullable)); } diff --git a/src/sinks/clickhouse/integration_tests.rs b/src/sinks/clickhouse/integration_tests.rs index 3798595708b41..a63e4a9e91e79 100644 --- a/src/sinks/clickhouse/integration_tests.rs +++ b/src/sinks/clickhouse/integration_tests.rs @@ -12,11 +12,12 @@ use futures::{ stream, }; use http::StatusCode; +use ordered_float::NotNan; use serde::Deserialize; use serde_json::Value; use tokio::time::{Duration, timeout}; use vector_lib::{ - codecs::encoding::BatchSerializerConfig, + codecs::encoding::{ArrowStreamSerializerConfig, BatchSerializerConfig}, event::{BatchNotifier, BatchStatus, BatchStatusReceiver, Event, LogEvent}, lookup::PathPrefix, }; @@ -605,3 +606,571 @@ async fn insert_events_arrow_with_schema_fetching() { assert!(row.get("active").and_then(|v| v.as_bool()).is_some()); } } + +#[tokio::test] +async fn test_complex_types() { + trace_init(); + + let table = random_table_name(); + let host = clickhouse_address(); + + let mut batch = BatchConfig::default(); + batch.max_events = Some(3); + + let arrow_config = ArrowStreamSerializerConfig { + allow_nullable_fields: true, + ..Default::default() + }; + + let config = ClickhouseConfig { + endpoint: host.parse().unwrap(), + table: table.clone().try_into().unwrap(), + compression: Compression::None, + format: crate::sinks::clickhouse::config::Format::ArrowStream, + batch_encoding: Some(BatchSerializerConfig::ArrowStream(arrow_config)), + batch, + request: TowerRequestConfig { + retry_attempts: 1, + ..Default::default() + }, + ..Default::default() + }; + + let client = ClickhouseClient::new(host); + + // Comprehensive schema with all complex types + client + .create_table( + &table, + "host String, timestamp DateTime64(3), message String, \ + nested_int_array Array(Array(Int32)), \ + nested_string_array Array(Array(String)), \ + array_map Map(String, Array(String)), \ + int_array_map Map(String, Array(Int64)), \ + tuple_with_array Tuple(String, Array(Int32)), \ + tuple_with_map Tuple(String, Map(String, Float64)), \ + tuple_with_nested Tuple(String, Array(Int32), Map(String, Float64)), \ + locations Array(Tuple(String, Float64, Float64)), \ + tags_history Array(Map(String, String)), \ + metrics_history Array(Map(String, Int32)), \ + request_headers Map(String, String), \ + response_metrics Tuple(Int32, Int64, Float64), \ + tags Array(String), \ + user_properties Map(String, Array(String)), \ + array_with_nulls Array(Nullable(Int32)), \ + array_with_named_tuple Array(Tuple(category String, tag String))", + ) + .await; + + let (sink, _hc) = config.build(SinkContext::default()).await.unwrap(); + + let mut events: Vec<Event> = Vec::new(); + + // Event 1: Comprehensive test with all complex types + let mut event1 = LogEvent::from("Comprehensive complex types test"); + event1.insert("host",
"host1.example.com"); + + // Nested arrays + event1.insert( + "nested_int_array", + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Integer(1), + vector_lib::event::Value::Integer(2), + ]), + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Integer(3), + vector_lib::event::Value::Integer(4), + ]), + ]), + ); + event1.insert( + "nested_string_array", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Bytes("a".into()), + vector_lib::event::Value::Bytes("b".into()), + ])]), + ); + + // Maps with arrays + let mut array_map = vector_lib::event::ObjectMap::new(); + array_map.insert( + "fruits".into(), + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Bytes("apple".into()), + vector_lib::event::Value::Bytes("banana".into()), + ]), + ); + event1.insert("array_map", vector_lib::event::Value::Object(array_map)); + + let mut int_array_map = vector_lib::event::ObjectMap::new(); + int_array_map.insert( + "scores".into(), + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Integer(95), + vector_lib::event::Value::Integer(87), + ]), + ); + event1.insert( + "int_array_map", + vector_lib::event::Value::Object(int_array_map), + ); + + // Tuples with complex types + let mut tuple_with_array = vector_lib::event::ObjectMap::new(); + tuple_with_array.insert( + "f0".into(), + vector_lib::event::Value::Bytes("numbers".into()), + ); + tuple_with_array.insert( + "f1".into(), + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Integer(10), + vector_lib::event::Value::Integer(20), + ]), + ); + event1.insert( + "tuple_with_array", + vector_lib::event::Value::Object(tuple_with_array), + ); + + let mut inner_map = vector_lib::event::ObjectMap::new(); + inner_map.insert( + "temp".into(), + vector_lib::event::Value::Float(NotNan::new(22.5).unwrap()), + ); + let mut tuple_with_map = vector_lib::event::ObjectMap::new(); + tuple_with_map.insert( + "f0".into(), + vector_lib::event::Value::Bytes("metrics".into()), + ); + tuple_with_map.insert("f1".into(), vector_lib::event::Value::Object(inner_map)); + event1.insert( + "tuple_with_map", + vector_lib::event::Value::Object(tuple_with_map), + ); + + let mut inner_map2 = vector_lib::event::ObjectMap::new(); + inner_map2.insert( + "avg".into(), + vector_lib::event::Value::Float(NotNan::new(95.5).unwrap()), + ); + let mut tuple_complex = vector_lib::event::ObjectMap::new(); + tuple_complex.insert( + "f0".into(), + vector_lib::event::Value::Bytes("results".into()), + ); + tuple_complex.insert( + "f1".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(95)]), + ); + tuple_complex.insert("f2".into(), vector_lib::event::Value::Object(inner_map2)); + event1.insert( + "tuple_with_nested", + vector_lib::event::Value::Object(tuple_complex), + ); + + // Array of tuples + let mut loc1 = vector_lib::event::ObjectMap::new(); + loc1.insert( + "f0".into(), + vector_lib::event::Value::Bytes("San Francisco".into()), + ); + loc1.insert( + "f1".into(), + vector_lib::event::Value::Float(NotNan::new(37.7749).unwrap()), + ); + loc1.insert( + "f2".into(), + vector_lib::event::Value::Float(NotNan::new(-122.4194).unwrap()), + ); + event1.insert( + "locations", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(loc1)]), + ); + + // Array of maps + let mut tags1 = vector_lib::event::ObjectMap::new(); + tags1.insert("env".into(), vector_lib::event::Value::Bytes("prod".into())); + 
event1.insert( + "tags_history", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(tags1)]), + ); + + let mut metrics1 = vector_lib::event::ObjectMap::new(); + metrics1.insert("cpu".into(), vector_lib::event::Value::Integer(45)); + event1.insert( + "metrics_history", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(metrics1)]), + ); + + // Structured log data + let mut headers = vector_lib::event::ObjectMap::new(); + headers.insert( + "user-agent".into(), + vector_lib::event::Value::Bytes("Mozilla/5.0".into()), + ); + event1.insert("request_headers", vector_lib::event::Value::Object(headers)); + + let mut metrics = vector_lib::event::ObjectMap::new(); + metrics.insert("f0".into(), vector_lib::event::Value::Integer(200)); + metrics.insert("f1".into(), vector_lib::event::Value::Integer(1234)); + metrics.insert( + "f2".into(), + vector_lib::event::Value::Float(NotNan::new(0.145).unwrap()), + ); + event1.insert( + "response_metrics", + vector_lib::event::Value::Object(metrics), + ); + + event1.insert( + "tags", + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Bytes("api".into()), + vector_lib::event::Value::Bytes("v2".into()), + ]), + ); + + let mut user_props = vector_lib::event::ObjectMap::new(); + user_props.insert( + "roles".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Bytes("admin".into())]), + ); + event1.insert( + "user_properties", + vector_lib::event::Value::Object(user_props), + ); + + // Nullable array + event1.insert( + "array_with_nulls", + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Integer(100), + vector_lib::event::Value::Integer(200), + ]), + ); + + // Named tuple array - tests that named fields work correctly + let mut named_tuple1 = vector_lib::event::ObjectMap::new(); + named_tuple1.insert( + "category".into(), + vector_lib::event::Value::Bytes("priority".into()), + ); + named_tuple1.insert("tag".into(), vector_lib::event::Value::Bytes("high".into())); + + let mut named_tuple2 = vector_lib::event::ObjectMap::new(); + named_tuple2.insert( + "category".into(), + vector_lib::event::Value::Bytes("environment".into()), + ); + named_tuple2.insert( + "tag".into(), + vector_lib::event::Value::Bytes("production".into()), + ); + + event1.insert( + "array_with_named_tuple", + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Object(named_tuple1), + vector_lib::event::Value::Object(named_tuple2), + ]), + ); + + events.push(event1.into()); + + // Event 2: Empty and edge cases + let mut event2 = LogEvent::from("Test empty collections"); + event2.insert("host", "host2.example.com"); + event2.insert("nested_int_array", vector_lib::event::Value::Array(vec![])); + event2.insert( + "nested_string_array", + vector_lib::event::Value::Array(vec![]), + ); + + let empty_map = vector_lib::event::ObjectMap::new(); + event2.insert( + "array_map", + vector_lib::event::Value::Object(empty_map.clone()), + ); + event2.insert( + "int_array_map", + vector_lib::event::Value::Object(empty_map.clone()), + ); + + let mut empty_tuple = vector_lib::event::ObjectMap::new(); + empty_tuple.insert("f0".into(), vector_lib::event::Value::Bytes("empty".into())); + empty_tuple.insert("f1".into(), vector_lib::event::Value::Array(vec![])); + event2.insert( + "tuple_with_array", + vector_lib::event::Value::Object(empty_tuple), + ); + + let mut empty_tuple_map = vector_lib::event::ObjectMap::new(); + empty_tuple_map.insert("f0".into(), vector_lib::event::Value::Bytes("empty".into())); + 
empty_tuple_map.insert( + "f1".into(), + vector_lib::event::Value::Object(empty_map.clone()), + ); + event2.insert( + "tuple_with_map", + vector_lib::event::Value::Object(empty_tuple_map), + ); + + let mut empty_tuple_complex = vector_lib::event::ObjectMap::new(); + empty_tuple_complex.insert("f0".into(), vector_lib::event::Value::Bytes("empty".into())); + empty_tuple_complex.insert("f1".into(), vector_lib::event::Value::Array(vec![])); + empty_tuple_complex.insert( + "f2".into(), + vector_lib::event::Value::Object(empty_map.clone()), + ); + event2.insert( + "tuple_with_nested", + vector_lib::event::Value::Object(empty_tuple_complex), + ); + + event2.insert("locations", vector_lib::event::Value::Array(vec![])); + event2.insert("tags_history", vector_lib::event::Value::Array(vec![])); + event2.insert("metrics_history", vector_lib::event::Value::Array(vec![])); + event2.insert( + "request_headers", + vector_lib::event::Value::Object(empty_map.clone()), + ); + + let mut empty_metrics = vector_lib::event::ObjectMap::new(); + empty_metrics.insert("f0".into(), vector_lib::event::Value::Integer(0)); + empty_metrics.insert("f1".into(), vector_lib::event::Value::Integer(0)); + empty_metrics.insert( + "f2".into(), + vector_lib::event::Value::Float(NotNan::new(0.0).unwrap()), + ); + event2.insert( + "response_metrics", + vector_lib::event::Value::Object(empty_metrics), + ); + + event2.insert("tags", vector_lib::event::Value::Array(vec![])); + event2.insert( + "user_properties", + vector_lib::event::Value::Object(empty_map), + ); + event2.insert("array_with_nulls", vector_lib::event::Value::Array(vec![])); + event2.insert( + "array_with_named_tuple", + vector_lib::event::Value::Array(vec![]), + ); + + events.push(event2.into()); + + // Event 3: More varied data + let mut event3 = LogEvent::from("Test varied data"); + event3.insert("host", "host3.example.com"); + + event3.insert( + "nested_int_array", + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Array(vec![]), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(99)]), + ]), + ); + event3.insert( + "nested_string_array", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Bytes("test".into()), + ])]), + ); + + let mut map3 = vector_lib::event::ObjectMap::new(); + map3.insert( + "colors".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Bytes("red".into())]), + ); + event3.insert("array_map", vector_lib::event::Value::Object(map3)); + + let mut int_map3 = vector_lib::event::ObjectMap::new(); + int_map3.insert( + "values".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(42)]), + ); + event3.insert("int_array_map", vector_lib::event::Value::Object(int_map3)); + + let mut tuple3 = vector_lib::event::ObjectMap::new(); + tuple3.insert("f0".into(), vector_lib::event::Value::Bytes("data".into())); + tuple3.insert( + "f1".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(5)]), + ); + event3.insert("tuple_with_array", vector_lib::event::Value::Object(tuple3)); + + let mut map_inner = vector_lib::event::ObjectMap::new(); + map_inner.insert( + "val".into(), + vector_lib::event::Value::Float(NotNan::new(1.0).unwrap()), + ); + let mut tuple_map3 = vector_lib::event::ObjectMap::new(); + tuple_map3.insert("f0".into(), vector_lib::event::Value::Bytes("test".into())); + tuple_map3.insert("f1".into(), vector_lib::event::Value::Object(map_inner)); + event3.insert( + "tuple_with_map", + 
vector_lib::event::Value::Object(tuple_map3), + ); + + let mut map_inner2 = vector_lib::event::ObjectMap::new(); + map_inner2.insert( + "x".into(), + vector_lib::event::Value::Float(NotNan::new(2.0).unwrap()), + ); + let mut tuple_nested3 = vector_lib::event::ObjectMap::new(); + tuple_nested3.insert("f0".into(), vector_lib::event::Value::Bytes("nest".into())); + tuple_nested3.insert( + "f1".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(1)]), + ); + tuple_nested3.insert("f2".into(), vector_lib::event::Value::Object(map_inner2)); + event3.insert( + "tuple_with_nested", + vector_lib::event::Value::Object(tuple_nested3), + ); + + let mut loc3 = vector_lib::event::ObjectMap::new(); + loc3.insert("f0".into(), vector_lib::event::Value::Bytes("NYC".into())); + loc3.insert( + "f1".into(), + vector_lib::event::Value::Float(NotNan::new(40.7128).unwrap()), + ); + loc3.insert( + "f2".into(), + vector_lib::event::Value::Float(NotNan::new(-74.0060).unwrap()), + ); + event3.insert( + "locations", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(loc3)]), + ); + + let mut tags3 = vector_lib::event::ObjectMap::new(); + tags3.insert("env".into(), vector_lib::event::Value::Bytes("dev".into())); + event3.insert( + "tags_history", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(tags3)]), + ); + + let mut metrics3 = vector_lib::event::ObjectMap::new(); + metrics3.insert("cpu".into(), vector_lib::event::Value::Integer(60)); + event3.insert( + "metrics_history", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(metrics3)]), + ); + + let mut headers3 = vector_lib::event::ObjectMap::new(); + headers3.insert( + "content-type".into(), + vector_lib::event::Value::Bytes("application/json".into()), + ); + event3.insert( + "request_headers", + vector_lib::event::Value::Object(headers3), + ); + + let mut metrics3_resp = vector_lib::event::ObjectMap::new(); + metrics3_resp.insert("f0".into(), vector_lib::event::Value::Integer(404)); + metrics3_resp.insert("f1".into(), vector_lib::event::Value::Integer(0)); + metrics3_resp.insert( + "f2".into(), + vector_lib::event::Value::Float(NotNan::new(0.001).unwrap()), + ); + event3.insert( + "response_metrics", + vector_lib::event::Value::Object(metrics3_resp), + ); + + event3.insert( + "tags", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Bytes("test".into())]), + ); + + let mut user_props3 = vector_lib::event::ObjectMap::new(); + user_props3.insert( + "permissions".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Bytes("read".into())]), + ); + event3.insert( + "user_properties", + vector_lib::event::Value::Object(user_props3), + ); + + event3.insert( + "array_with_nulls", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(42)]), + ); + + // Named tuple with single element + let mut named_tuple3 = vector_lib::event::ObjectMap::new(); + named_tuple3.insert( + "category".into(), + vector_lib::event::Value::Bytes("status".into()), + ); + named_tuple3.insert( + "tag".into(), + vector_lib::event::Value::Bytes("active".into()), + ); + event3.insert( + "array_with_named_tuple", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(named_tuple3)]), + ); + + events.push(event3.into()); + + run_and_assert_sink_compliance(sink, stream::iter(events), &SINK_TAGS).await; + + let output = client.select_all(&table).await; + assert_eq!(3, output.rows); + + // Verify event 1 - comprehensive data + let row1 = 
&output.data[0]; + assert!( + row1.get("nested_int_array") + .and_then(|v| v.as_array()) + .is_some() + ); + assert!(row1.get("array_map").and_then(|v| v.as_object()).is_some()); + // Tuples are returned as arrays from ClickHouse + assert!( + row1.get("tuple_with_array") + .and_then(|v| v.as_array()) + .is_some() + ); + assert!(row1.get("locations").and_then(|v| v.as_array()).is_some()); + assert!( + row1.get("tags_history") + .and_then(|v| v.as_array()) + .is_some() + ); + assert!( + row1.get("request_headers") + .and_then(|v| v.as_object()) + .is_some() + ); + assert!( + row1.get("array_with_nulls") + .and_then(|v| v.as_array()) + .is_some() + ); + + // Verify event 2 - empty collections + let row2 = &output.data[1]; + let empty_nested = row2 + .get("nested_int_array") + .and_then(|v| v.as_array()) + .unwrap(); + assert_eq!(0, empty_nested.len()); + let empty_tags = row2.get("tags").and_then(|v| v.as_array()).unwrap(); + assert_eq!(0, empty_tags.len()); + + // Verify event 3 - varied data + let row3 = &output.data[2]; + let nested3 = row3 + .get("nested_int_array") + .and_then(|v| v.as_array()) + .unwrap(); + assert_eq!(2, nested3.len()); +} diff --git a/website/cue/reference/components/sinks/clickhouse.cue b/website/cue/reference/components/sinks/clickhouse.cue index 1049cf5217976..c0d80d1444388 100644 --- a/website/cue/reference/components/sinks/clickhouse.cue +++ b/website/cue/reference/components/sinks/clickhouse.cue @@ -142,9 +142,6 @@ components: sinks: clickhouse: { The following ClickHouse column types are **not yet supported** by Vector's ArrowStream implementation: - - `Array` - - `Tuple` - - `Map` - `IPv4` - `IPv6`