From 8850009875d4356eda7ad8e07e96fd85d45bcf53 Mon Sep 17 00:00:00 2001
From: benjamin-awd
Date: Tue, 23 Dec 2025 17:44:29 +0800
Subject: [PATCH 01/11] refactor: split arrow encoder into smaller files

---
 lib/codecs/src/encoding/format/arrow.rs       | 1671 -----------------
 .../src/encoding/format/arrow/builder.rs      |   66 +
 lib/codecs/src/encoding/format/arrow/mod.rs   |  254 +++
 lib/codecs/src/encoding/format/arrow/tests.rs | 1011 ++++++++++
 .../encoding/format/arrow/types/decimal.rs    |  116 ++
 .../src/encoding/format/arrow/types/mod.rs    |   11 +
 .../encoding/format/arrow/types/primitives.rs |  187 ++
 .../encoding/format/arrow/types/temporal.rs   |   85 +
 8 files changed, 1730 insertions(+), 1671 deletions(-)
 delete mode 100644 lib/codecs/src/encoding/format/arrow.rs
 create mode 100644 lib/codecs/src/encoding/format/arrow/builder.rs
 create mode 100644 lib/codecs/src/encoding/format/arrow/mod.rs
 create mode 100644 lib/codecs/src/encoding/format/arrow/tests.rs
 create mode 100644 lib/codecs/src/encoding/format/arrow/types/decimal.rs
 create mode 100644 lib/codecs/src/encoding/format/arrow/types/mod.rs
 create mode 100644 lib/codecs/src/encoding/format/arrow/types/primitives.rs
 create mode 100644 lib/codecs/src/encoding/format/arrow/types/temporal.rs

diff --git a/lib/codecs/src/encoding/format/arrow.rs b/lib/codecs/src/encoding/format/arrow.rs
deleted file mode 100644
index 3c2d3863f1fb2..0000000000000
--- a/lib/codecs/src/encoding/format/arrow.rs
+++ /dev/null
@@ -1,1671 +0,0 @@
-//! Arrow IPC streaming format codec for batched event encoding
-//!
-//! Provides Apache Arrow IPC stream format encoding with static schema support.
-//! This implements the streaming variant of the Arrow IPC protocol, which writes
-//! a continuous stream of record batches without a file footer.
-
-use arrow::{
-    array::{
-        ArrayRef, BinaryBuilder, BooleanBuilder, Decimal128Builder, Decimal256Builder,
-        Float32Builder, Float64Builder, Int8Builder, Int16Builder, Int32Builder, Int64Builder,
-        StringBuilder, TimestampMicrosecondBuilder, TimestampMillisecondBuilder,
-        TimestampNanosecondBuilder, TimestampSecondBuilder, UInt8Builder, UInt16Builder,
-        UInt32Builder, UInt64Builder,
-    },
-    datatypes::{DataType, Schema, TimeUnit, i256},
-    ipc::writer::StreamWriter,
-    record_batch::RecordBatch,
-};
-use async_trait::async_trait;
-use bytes::{BufMut, Bytes, BytesMut};
-use chrono::{DateTime, Utc};
-use rust_decimal::Decimal;
-use snafu::Snafu;
-use std::sync::Arc;
-use vector_config::configurable_component;
-
-use vector_core::event::{Event, Value};
-
-/// Provides Arrow schema for encoding.
-///
-/// Sinks can implement this trait to provide custom schema fetching logic.
-#[async_trait]
-pub trait SchemaProvider: Send + Sync + std::fmt::Debug {
-    /// Fetch the Arrow schema from the data store.
-    ///
-    /// This is called during sink configuration build phase to fetch
-    /// the schema once at startup, rather than at runtime.
-    async fn get_schema(&self) -> Result<Schema, String>;
-}
-
-/// Configuration for Arrow IPC stream serialization
-#[configurable_component]
-#[derive(Clone, Default)]
-pub struct ArrowStreamSerializerConfig {
-    /// The Arrow schema to use for encoding
-    #[serde(skip)]
-    #[configurable(derived)]
-    pub schema: Option<Schema>,
-
-    /// Allow null values for non-nullable fields in the schema.
-    ///
-    /// When enabled, missing or incompatible values will be encoded as null even for fields
-    /// marked as non-nullable in the Arrow schema. This is useful when working with downstream
-    /// systems that can handle null values through defaults, computed columns, or other mechanisms.
-    ///
-    /// When disabled (default), missing values for non-nullable fields will cause encoding errors,
-    /// ensuring all required data is present before sending to the sink.
-    #[serde(default)]
-    #[configurable(derived)]
-    pub allow_nullable_fields: bool,
-}
-
-impl std::fmt::Debug for ArrowStreamSerializerConfig {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("ArrowStreamSerializerConfig")
-            .field(
-                "schema",
-                &self
-                    .schema
-                    .as_ref()
-                    .map(|s| format!("{} fields", s.fields().len())),
-            )
-            .field("allow_nullable_fields", &self.allow_nullable_fields)
-            .finish()
-    }
-}
-
-impl ArrowStreamSerializerConfig {
-    /// Create a new ArrowStreamSerializerConfig with a schema
-    pub fn new(schema: arrow::datatypes::Schema) -> Self {
-        Self {
-            schema: Some(schema),
-            allow_nullable_fields: false,
-        }
-    }
-
-    /// The data type of events that are accepted by `ArrowStreamEncoder`.
-    pub fn input_type(&self) -> vector_core::config::DataType {
-        vector_core::config::DataType::Log
-    }
-
-    /// The schema required by the serializer.
-    pub fn schema_requirement(&self) -> vector_core::schema::Requirement {
-        vector_core::schema::Requirement::empty()
-    }
-}
-
-/// Arrow IPC stream batch serializer that holds the schema
-#[derive(Clone, Debug)]
-pub struct ArrowStreamSerializer {
-    schema: Arc<Schema>,
-}
-
-impl ArrowStreamSerializer {
-    /// Create a new ArrowStreamSerializer with the given configuration
-    pub fn new(config: ArrowStreamSerializerConfig) -> Result<Self, vector_common::Error> {
-        let schema = config
-            .schema
-            .ok_or_else(|| vector_common::Error::from("Arrow serializer requires a schema."))?;
-
-        // If allow_nullable_fields is enabled, transform the schema once here
-        // instead of on every batch encoding
-        let schema = if config.allow_nullable_fields {
-            Schema::new_with_metadata(
-                schema
-                    .fields()
-                    .iter()
-                    .map(|f| Arc::new(make_field_nullable(f)))
-                    .collect::<Vec<_>>(),
-                schema.metadata().clone(),
-            )
-        } else {
-            schema
-        };
-
-        Ok(Self {
-            schema: Arc::new(schema),
-        })
-    }
-}
-
-impl tokio_util::codec::Encoder<Vec<Event>> for ArrowStreamSerializer {
-    type Error = ArrowEncodingError;
-
-    fn encode(&mut self, events: Vec<Event>, buffer: &mut BytesMut) -> Result<(), Self::Error> {
-        if events.is_empty() {
-            return Err(ArrowEncodingError::NoEvents);
-        }
-
-        let bytes = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&self.schema)))?;
-
-        buffer.extend_from_slice(&bytes);
-        Ok(())
-    }
-}
-
-/// Errors that can occur during Arrow encoding
-#[derive(Debug, Snafu)]
-pub enum ArrowEncodingError {
-    /// Failed to create Arrow record batch
-    #[snafu(display("Failed to create Arrow record batch: {}", source))]
-    RecordBatchCreation {
-        /// The underlying Arrow error
-        source: arrow::error::ArrowError,
-    },
-
-    /// Failed to write Arrow IPC data
-    #[snafu(display("Failed to write Arrow IPC data: {}", source))]
-    IpcWrite {
-        /// The underlying Arrow error
-        source: arrow::error::ArrowError,
-    },
-
-    /// No events provided for encoding
-    #[snafu(display("No events provided for encoding"))]
-    NoEvents,
-
-    /// Schema must be provided before encoding
-    #[snafu(display("Schema must be provided before encoding"))]
-    NoSchemaProvided,
-
-    /// Failed to fetch schema from provider
-    #[snafu(display("Failed to fetch schema from provider: {}", message))]
-    SchemaFetchError {
-        /// Error message from the provider
-        message: String,
-    },
-
-    /// Unsupported Arrow data type for field
-    #[snafu(display(
-        "Unsupported Arrow data type for field '{}': {:?}",
-        field_name,
-        data_type
-    ))]
-    UnsupportedType {
-        /// The field name
-        field_name: String,
-        /// The unsupported data type
-        data_type: DataType,
-    },
-
-    /// Null value encountered for non-nullable field
-    #[snafu(display("Null value for non-nullable field '{}'", field_name))]
-    NullConstraint {
-        /// The field name
-        field_name: String,
-    },
-
-    /// IO error during encoding
-    #[snafu(display("IO error: {}", source))]
-    Io {
-        /// The underlying IO error
-        source: std::io::Error,
-    },
-}
-
-impl From<std::io::Error> for ArrowEncodingError {
-    fn from(error: std::io::Error) -> Self {
-        Self::Io { source: error }
-    }
-}
-
-/// Encodes a batch of events into Arrow IPC streaming format
-pub fn encode_events_to_arrow_ipc_stream(
-    events: &[Event],
-    schema: Option<Arc<Schema>>,
-) -> Result<Bytes, ArrowEncodingError> {
-    if events.is_empty() {
-        return Err(ArrowEncodingError::NoEvents);
-    }
-
-    let schema_ref = schema.ok_or(ArrowEncodingError::NoSchemaProvided)?;
-
-    let record_batch = build_record_batch(schema_ref, events)?;
-
-    let ipc_err = |source| ArrowEncodingError::IpcWrite { source };
-
-    let mut buffer = BytesMut::new().writer();
-    let mut writer =
-        StreamWriter::try_new(&mut buffer, record_batch.schema_ref()).map_err(ipc_err)?;
-    writer.write(&record_batch).map_err(ipc_err)?;
-    writer.finish().map_err(ipc_err)?;
-
-    Ok(buffer.into_inner().freeze())
-}
-
-/// Recursively makes a Field and all its nested fields nullable
-fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Field {
-    let new_data_type = match field.data_type() {
-        DataType::List(inner_field) => DataType::List(Arc::new(make_field_nullable(inner_field))),
-        DataType::Struct(fields) => {
-            DataType::Struct(fields.iter().map(|f| make_field_nullable(f)).collect())
-        }
-        DataType::Map(inner_field, sorted) => {
-            DataType::Map(Arc::new(make_field_nullable(inner_field)), *sorted)
-        }
-        other => other.clone(),
-    };
-
-    field
-        .clone()
-        .with_data_type(new_data_type)
-        .with_nullable(true)
-}
-
-/// Builds an Arrow RecordBatch from events
-fn build_record_batch(
-    schema: Arc<Schema>,
-    events: &[Event],
-) -> Result<RecordBatch, ArrowEncodingError> {
-    let num_fields = schema.fields().len();
-    let mut columns: Vec<ArrayRef> = Vec::with_capacity(num_fields);
-
-    for field in schema.fields() {
-        let field_name = field.name();
-        let nullable = field.is_nullable();
-        let array: ArrayRef = match field.data_type() {
-            DataType::Timestamp(time_unit, _) => {
-                build_timestamp_array(events, field_name, *time_unit, nullable)?
-            }
-            DataType::Utf8 => build_string_array(events, field_name, nullable)?,
-            DataType::Int8 => build_int8_array(events, field_name, nullable)?,
-            DataType::Int16 => build_int16_array(events, field_name, nullable)?,
-            DataType::Int32 => build_int32_array(events, field_name, nullable)?,
-            DataType::Int64 => build_int64_array(events, field_name, nullable)?,
-            DataType::UInt8 => build_uint8_array(events, field_name, nullable)?,
-            DataType::UInt16 => build_uint16_array(events, field_name, nullable)?,
-            DataType::UInt32 => build_uint32_array(events, field_name, nullable)?,
-            DataType::UInt64 => build_uint64_array(events, field_name, nullable)?,
-            DataType::Float32 => build_float32_array(events, field_name, nullable)?,
-            DataType::Float64 => build_float64_array(events, field_name, nullable)?,
-            DataType::Boolean => build_boolean_array(events, field_name, nullable)?,
-            DataType::Binary => build_binary_array(events, field_name, nullable)?,
-            DataType::Decimal128(precision, scale) => {
-                build_decimal128_array(events, field_name, *precision, *scale, nullable)?
-            }
-            DataType::Decimal256(precision, scale) => {
-                build_decimal256_array(events, field_name, *precision, *scale, nullable)?
-            }
-            other_type => {
-                return Err(ArrowEncodingError::UnsupportedType {
-                    field_name: field_name.into(),
-                    data_type: other_type.clone(),
-                });
-            }
-        };
-
-        columns.push(array);
-    }
-
-    RecordBatch::try_new(schema, columns)
-        .map_err(|source| ArrowEncodingError::RecordBatchCreation { source })
-}
-
-/// Macro to handle appending null or returning an error for non-nullable fields.
-macro_rules! handle_null_constraints {
-    ($builder:expr, $nullable:expr, $field_name:expr) => {{
-        if !$nullable {
-            return Err(ArrowEncodingError::NullConstraint {
-                field_name: $field_name.into(),
-            });
-        }
-        $builder.append_null();
-    }};
-}
-
-/// Macro to generate a `build_*_array` function for primitive types.
-macro_rules! define_build_primitive_array_fn {
-    (
-        $fn_name:ident,  // The function name (e.g., build_int8_array)
-        $builder_ty:ty,  // The builder type (e.g., Int8Builder)
-        // One or more match arms for valid Value types
-        $( $value_pat:pat $(if $guard:expr)? => $append_expr:expr ),+
-    ) => {
-        fn $fn_name(
-            events: &[Event],
-            field_name: &str,
-            nullable: bool,
-        ) -> Result<ArrayRef, ArrowEncodingError> {
-            let mut builder = <$builder_ty>::with_capacity(events.len());
-
-            for event in events {
-                if let Event::Log(log) = event {
-                    match log.get(field_name) {
-                        $(
-                            $value_pat $(if $guard)? => builder.append_value($append_expr),
-                        )+
-                        // All other patterns are treated as null/invalid
-                        _ => handle_null_constraints!(builder, nullable, field_name),
-                    }
-                }
-            }
-            Ok(Arc::new(builder.finish()))
-        }
-    };
-}
-
-fn extract_timestamp(value: &Value) -> Option<DateTime<Utc>> {
-    match value {
-        Value::Timestamp(ts) => Some(*ts),
-        Value::Bytes(bytes) => std::str::from_utf8(bytes)
-            .ok()
-            .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok())
-            .map(|dt| dt.with_timezone(&Utc)),
-        _ => None,
-    }
-}
-
-fn build_timestamp_array(
-    events: &[Event],
-    field_name: &str,
-    time_unit: TimeUnit,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    macro_rules! build_array {
-        ($builder:ty, $converter:expr) => {{
-            let mut builder = <$builder>::with_capacity(events.len());
-            for event in events {
-                if let Event::Log(log) = event {
-                    let value_to_append = log.get(field_name).and_then(|value| {
-                        // First, try to extract it as a native or string timestamp
-                        if let Some(ts) = extract_timestamp(value) {
-                            $converter(&ts)
-                        }
-                        // Else, fall back to a raw integer
-                        else if let Value::Integer(i) = value {
-                            Some(*i)
-                        }
-                        // Else, it's an unsupported type (e.g., Bool, Float)
-                        else {
-                            None
-                        }
-                    });
-
-                    if value_to_append.is_none() && !nullable {
-                        return Err(ArrowEncodingError::NullConstraint {
-                            field_name: field_name.into(),
-                        });
-                    }
-
-                    builder.append_option(value_to_append);
-                }
-            }
-            Ok(Arc::new(builder.finish()))
-        }};
-    }
-
-    match time_unit {
-        TimeUnit::Second => {
-            build_array!(TimestampSecondBuilder, |ts: &DateTime<Utc>| Some(
-                ts.timestamp()
-            ))
-        }
-        TimeUnit::Millisecond => {
-            build_array!(TimestampMillisecondBuilder, |ts: &DateTime<Utc>| Some(
-                ts.timestamp_millis()
-            ))
-        }
-        TimeUnit::Microsecond => {
-            build_array!(TimestampMicrosecondBuilder, |ts: &DateTime<Utc>| Some(
-                ts.timestamp_micros()
-            ))
-        }
-        TimeUnit::Nanosecond => {
-            build_array!(TimestampNanosecondBuilder, |ts: &DateTime<Utc>| ts
-                .timestamp_nanos_opt())
-        }
-    }
-}
-
-fn build_string_array(
-    events: &[Event],
-    field_name: &str,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    let mut builder = StringBuilder::with_capacity(events.len(), 0);
-
-    for event in events {
-        if let Event::Log(log) = event {
-            let mut appended = false;
-            if let Some(value) = log.get(field_name) {
-                match value {
-                    Value::Bytes(bytes) => {
-                        // Attempt direct UTF-8 conversion first, fallback to lossy
-                        match std::str::from_utf8(bytes) {
-                            Ok(s) => builder.append_value(s),
-                            Err(_) => builder.append_value(&String::from_utf8_lossy(bytes)),
-                        }
-                        appended = true;
-                    }
-                    Value::Object(obj) => {
-                        if let Ok(s) = serde_json::to_string(&obj) {
-                            builder.append_value(s);
-                            appended = true;
-                        }
-                    }
-                    Value::Array(arr) => {
-                        if let Ok(s) = serde_json::to_string(&arr) {
-                            builder.append_value(s);
-                            appended = true;
-                        }
-                    }
-                    _ => {
-                        builder.append_value(&value.to_string_lossy());
-                        appended = true;
-                    }
-                }
-            }
-
-            if !appended {
-                handle_null_constraints!(builder, nullable, field_name);
-            }
-        }
-    }
-
-    Ok(Arc::new(builder.finish()))
-}
-
-define_build_primitive_array_fn!(
-    build_int8_array,
-    Int8Builder,
-    Some(Value::Integer(i)) if *i >= i8::MIN as i64 && *i <= i8::MAX as i64 => *i as i8
-);
-
-define_build_primitive_array_fn!(
-    build_int16_array,
-    Int16Builder,
-    Some(Value::Integer(i)) if *i >= i16::MIN as i64 && *i <= i16::MAX as i64 => *i as i16
-);
-
-define_build_primitive_array_fn!(
-    build_int32_array,
-    Int32Builder,
-    Some(Value::Integer(i)) if *i >= i32::MIN as i64 && *i <= i32::MAX as i64 => *i as i32
-);
-
-define_build_primitive_array_fn!(
-    build_int64_array,
-    Int64Builder,
-    Some(Value::Integer(i)) => *i
-);
-
-define_build_primitive_array_fn!(
-    build_uint8_array,
-    UInt8Builder,
-    Some(Value::Integer(i)) if *i >= 0 && *i <= u8::MAX as i64 => *i as u8
-);
-
-define_build_primitive_array_fn!(
-    build_uint16_array,
-    UInt16Builder,
-    Some(Value::Integer(i)) if *i >= 0 && *i <= u16::MAX as i64 => *i as u16
-);
-
-define_build_primitive_array_fn!(
-    build_uint32_array,
-    UInt32Builder,
-    Some(Value::Integer(i)) if *i >= 0 && *i <= u32::MAX as i64 => *i as u32
-);
-
-define_build_primitive_array_fn!(
-    build_uint64_array,
-    UInt64Builder,
-    Some(Value::Integer(i)) if *i >= 0 => *i as u64
-);
-
-define_build_primitive_array_fn!(
-    build_float32_array,
-    Float32Builder,
-    Some(Value::Float(f)) => f.into_inner() as f32,
-    Some(Value::Integer(i)) => *i as f32
-);
-
-define_build_primitive_array_fn!(
-    build_float64_array,
-    Float64Builder,
-    Some(Value::Float(f)) => f.into_inner(),
-    Some(Value::Integer(i)) => *i as f64
-);
-
-define_build_primitive_array_fn!(
-    build_boolean_array,
-    BooleanBuilder,
-    Some(Value::Boolean(b)) => *b
-);
-
-fn build_binary_array(
-    events: &[Event],
-    field_name: &str,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    let mut builder = BinaryBuilder::with_capacity(events.len(), 0);
-
-    for event in events {
-        if let Event::Log(log) = event {
-            match log.get(field_name) {
-                Some(Value::Bytes(bytes)) => builder.append_value(bytes),
-                _ => handle_null_constraints!(builder, nullable, field_name),
-            }
-        }
-    }
-
-    Ok(Arc::new(builder.finish()))
-}
-
-fn build_decimal128_array(
-    events: &[Event],
-    field_name: &str,
-    precision: u8,
-    scale: i8,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    let mut builder = Decimal128Builder::with_capacity(events.len())
-        .with_precision_and_scale(precision, scale)
-        .map_err(|_| ArrowEncodingError::UnsupportedType {
-            field_name: field_name.into(),
-            data_type: DataType::Decimal128(precision, scale),
-        })?;
-
-    let target_scale = scale.unsigned_abs() as u32;
-
-    for event in events {
-        if let Event::Log(log) = event {
-            let mut appended = false;
-            match log.get(field_name) {
-                Some(Value::Float(f)) => {
-                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
-                        decimal.rescale(target_scale);
-                        let mantissa = decimal.mantissa();
-                        builder.append_value(mantissa);
-                        appended = true;
-                    }
-                }
-                Some(Value::Integer(i)) => {
-                    let mut decimal = Decimal::from(*i);
-                    decimal.rescale(target_scale);
-                    let mantissa = decimal.mantissa();
-                    builder.append_value(mantissa);
-                    appended = true;
-                }
-                _ => {}
-            }
-
-            if !appended {
-                handle_null_constraints!(builder, nullable, field_name);
-            }
-        }
-    }
-
-    Ok(Arc::new(builder.finish()))
-}
-
-fn build_decimal256_array(
-    events: &[Event],
-    field_name: &str,
-    precision: u8,
-    scale: i8,
-    nullable: bool,
-) -> Result<ArrayRef, ArrowEncodingError> {
-    let mut builder = Decimal256Builder::with_capacity(events.len())
-        .with_precision_and_scale(precision, scale)
-        .map_err(|_| ArrowEncodingError::UnsupportedType {
-            field_name: field_name.into(),
-            data_type: DataType::Decimal256(precision, scale),
-        })?;
-
-    let target_scale = scale.unsigned_abs() as u32;
-
-    for event in events {
-        if let Event::Log(log) = event {
-            let mut appended = false;
-            match log.get(field_name) {
-                Some(Value::Float(f)) => {
-                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
-                        decimal.rescale(target_scale);
-                        let mantissa = decimal.mantissa();
-                        // rust_decimal does not support i256 natively so we upcast here
-                        builder.append_value(i256::from_i128(mantissa));
-                        appended = true;
-                    }
-                }
-                Some(Value::Integer(i)) => {
-                    let mut decimal = Decimal::from(*i);
-                    decimal.rescale(target_scale);
-                    let mantissa = decimal.mantissa();
-                    builder.append_value(i256::from_i128(mantissa));
-                    appended = true;
-                }
-                _ => {}
-            }
-
-            if !appended {
-                handle_null_constraints!(builder, nullable, field_name);
-            }
-        }
-    }
-
-    Ok(Arc::new(builder.finish()))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use arrow::{
-        array::{
-            Array, BinaryArray, BooleanArray, Float64Array, Int64Array, StringArray,
-            TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
-            TimestampSecondArray,
-        },
-        datatypes::Field,
-        ipc::reader::StreamReader,
-    };
-    use chrono::Utc;
-    use std::io::Cursor;
-    use vector_core::event::LogEvent;
-
-    #[test]
-    fn test_encode_all_types() {
-        let mut log = LogEvent::default();
-        log.insert("string_field", "test");
-        log.insert("int8_field", 127);
-        log.insert("int16_field", 32000);
-        log.insert("int32_field", 1000000);
-        log.insert("int64_field", 42);
-        log.insert("float32_field", 3.15);
-        log.insert("float64_field", 3.15);
-        log.insert("bool_field", true);
-        log.insert("bytes_field", bytes::Bytes::from("binary"));
-        log.insert("timestamp_field", Utc::now());
-
-        let events = vec![Event::Log(log)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("string_field", DataType::Utf8, true),
-            Field::new("int8_field", DataType::Int8, true),
-            Field::new("int16_field", DataType::Int16, true),
-            Field::new("int32_field", DataType::Int32, true),
-            Field::new("int64_field", DataType::Int64, true),
-            Field::new("float32_field", DataType::Float32, true),
-            Field::new("float64_field", DataType::Float64, true),
-            Field::new("bool_field", DataType::Boolean, true),
-            Field::new("bytes_field", DataType::Binary, true),
-            Field::new(
-                "timestamp_field",
-                DataType::Timestamp(TimeUnit::Millisecond, None),
-                true,
-            ),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-        assert_eq!(batch.num_columns(), 10);
-
-        // Verify string field
-        assert_eq!(
-            batch
-                .column(0)
-                .as_any()
-                .downcast_ref::<StringArray>()
-                .unwrap()
-                .value(0),
-            "test"
-        );
-
-        // Verify int8 field
-        assert_eq!(
-            batch
-                .column(1)
-                .as_any()
-                .downcast_ref::<arrow::array::Int8Array>()
-                .unwrap()
-                .value(0),
-            127
-        );
-
-        // Verify int16 field
-        assert_eq!(
-            batch
-                .column(2)
-                .as_any()
-                .downcast_ref::<arrow::array::Int16Array>()
-                .unwrap()
-                .value(0),
-            32000
-        );
-
-        // Verify int32 field
-        assert_eq!(
-            batch
-                .column(3)
-                .as_any()
-                .downcast_ref::<arrow::array::Int32Array>()
-                .unwrap()
-                .value(0),
-            1000000
-        );
-
-        // Verify int64 field
-        assert_eq!(
-            batch
-                .column(4)
-                .as_any()
-                .downcast_ref::<Int64Array>()
-                .unwrap()
-                .value(0),
-            42
-        );
-
-        // Verify float32 field
-        assert!(
-            (batch
-                .column(5)
-                .as_any()
-                .downcast_ref::<arrow::array::Float32Array>()
-                .unwrap()
-                .value(0)
-                - 3.15)
-                .abs()
-                < 0.001
-        );
-
-        // Verify float64 field
-        assert!(
-            (batch
-                .column(6)
-                .as_any()
-                .downcast_ref::<Float64Array>()
-                .unwrap()
-                .value(0)
-                - 3.15)
-                .abs()
-                < 0.001
-        );
-
-        // Verify boolean field
-        assert!(
-            batch
-                .column(7)
-                .as_any()
-                .downcast_ref::<BooleanArray>()
-                .unwrap()
-                .value(0),
-            "{}",
-            true
-        );
-
-        // Verify binary field
-        assert_eq!(
-            batch
-                .column(8)
-                .as_any()
-                .downcast_ref::<BinaryArray>()
-                .unwrap()
-                .value(0),
-            b"binary"
-        );
-
-        // Verify timestamp field
-        assert!(
-            !batch
-                .column(9)
-                .as_any()
-                .downcast_ref::<TimestampMillisecondArray>()
-                .unwrap()
-                .is_null(0)
-        );
-    }
-
-    #[test]
-    fn test_encode_null_values() {
-        let mut log1 = LogEvent::default();
-        log1.insert("field_a", 1);
-        // field_b is missing
-
-        let mut log2 = LogEvent::default();
-        log2.insert("field_b", 2);
-        // field_a is missing
-
-        let events = vec![Event::Log(log1), Event::Log(log2)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("field_a", DataType::Int64, true),
-            Field::new("field_b", DataType::Int64, true),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 2);
-
-        let field_a = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-        assert_eq!(field_a.value(0), 1);
-        assert!(field_a.is_null(1));
-
-        let field_b = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-        assert!(field_b.is_null(0));
-        assert_eq!(field_b.value(1), 2);
-    }
-
-    #[test]
-    fn test_encode_type_mismatches() {
-        let mut log1 = LogEvent::default();
-        log1.insert("field", 42); // Integer
-
-        let mut log2 = LogEvent::default();
-        log2.insert("field", 3.15); // Float - type mismatch!
-
-        let events = vec![Event::Log(log1), Event::Log(log2)];
-
-        // Schema expects Int64
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "field",
-            DataType::Int64,
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 2);
-
-        let field_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-        assert_eq!(field_array.value(0), 42);
-        assert!(field_array.is_null(1)); // Type mismatch becomes null
-    }
-
-    #[test]
-    fn test_encode_complex_json_values() {
-        use serde_json::json;
-
-        let mut log = LogEvent::default();
-        log.insert(
-            "object_field",
-            json!({"key": "value", "nested": {"count": 42}}),
-        );
-        log.insert("array_field", json!([1, 2, 3]));
-
-        let events = vec![Event::Log(log)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("object_field", DataType::Utf8, true),
-            Field::new("array_field", DataType::Utf8, true),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-
-        let object_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .unwrap();
-        let object_str = object_array.value(0);
-        assert!(object_str.contains("key"));
-        assert!(object_str.contains("value"));
-
-        let array_array = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .unwrap();
-        let array_str = array_array.value(0);
-        assert_eq!(array_str, "[1,2,3]");
-    }
-
-    #[test]
-    fn test_encode_unsupported_type() {
-        let mut log = LogEvent::default();
-        log.insert("field", "value");
-
-        let events = vec![Event::Log(log)];
-
-        // Use an unsupported type
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "field",
-            DataType::Duration(TimeUnit::Millisecond),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(schema));
-        assert!(result.is_err());
-        assert!(matches!(
-            result.unwrap_err(),
-            ArrowEncodingError::UnsupportedType { .. }
-        ));
-    }
-
-    #[test]
-    fn test_encode_without_schema_fails() {
-        let mut log1 = LogEvent::default();
-        log1.insert("message", "hello");
-
-        let events = vec![Event::Log(log1)];
-
-        let result = encode_events_to_arrow_ipc_stream(&events, None);
-        assert!(result.is_err());
-        assert!(matches!(
-            result.unwrap_err(),
-            ArrowEncodingError::NoSchemaProvided
-        ));
-    }
-
-    #[test]
-    fn test_encode_empty_events() {
-        let events: Vec<Event> = vec![];
-        let result = encode_events_to_arrow_ipc_stream(&events, None);
-        assert!(result.is_err());
-        assert!(matches!(result.unwrap_err(), ArrowEncodingError::NoEvents));
-    }
-
-    #[test]
-    fn test_encode_timestamp_precisions() {
-        let now = Utc::now();
-        let mut log = LogEvent::default();
-        log.insert("ts_second", now);
-        log.insert("ts_milli", now);
-        log.insert("ts_micro", now);
-        log.insert("ts_nano", now);
-
-        let events = vec![Event::Log(log)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new(
-                "ts_second",
-                DataType::Timestamp(TimeUnit::Second, None),
-                true,
-            ),
-            Field::new(
-                "ts_milli",
-                DataType::Timestamp(TimeUnit::Millisecond, None),
-                true,
-            ),
-            Field::new(
-                "ts_micro",
-                DataType::Timestamp(TimeUnit::Microsecond, None),
-                true,
-            ),
-            Field::new(
-                "ts_nano",
-                DataType::Timestamp(TimeUnit::Nanosecond, None),
-                true,
-            ),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-        assert_eq!(batch.num_columns(), 4);
-
-        let ts_second = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<TimestampSecondArray>()
-            .unwrap();
-        assert!(!ts_second.is_null(0));
-        assert_eq!(ts_second.value(0), now.timestamp());
-
-        let ts_milli = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<TimestampMillisecondArray>()
-            .unwrap();
-        assert!(!ts_milli.is_null(0));
-        assert_eq!(ts_milli.value(0), now.timestamp_millis());
-
-        let ts_micro = batch
-            .column(2)
-            .as_any()
-            .downcast_ref::<TimestampMicrosecondArray>()
-            .unwrap();
-        assert!(!ts_micro.is_null(0));
-        assert_eq!(ts_micro.value(0), now.timestamp_micros());
-
-        let ts_nano = batch
-            .column(3)
-            .as_any()
-            .downcast_ref::<TimestampNanosecondArray>()
-            .unwrap();
-        assert!(!ts_nano.is_null(0));
-        assert_eq!(ts_nano.value(0), now.timestamp_nanos_opt().unwrap());
-    }
-
-    #[test]
-    fn test_encode_mixed_timestamp_string_and_native() {
-        // Test mixing string timestamps with native Timestamp values
-        let mut log1 = LogEvent::default();
-        log1.insert("ts", "2025-10-22T10:18:44.256Z"); // String
-
-        let mut log2 = LogEvent::default();
-        log2.insert("ts", Utc::now()); // Native Timestamp
-
-        let mut log3 = LogEvent::default();
-        log3.insert("ts", 1729594724256000000_i64); // Integer (nanoseconds)
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "ts",
-            DataType::Timestamp(TimeUnit::Nanosecond, None),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 3);
-
-        let ts_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<TimestampNanosecondArray>()
-            .unwrap();
-
-        // All three should be non-null
-        assert!(!ts_array.is_null(0));
-        assert!(!ts_array.is_null(1));
-        assert!(!ts_array.is_null(2));
-
-        // First one should match the parsed string
-        let expected = chrono::DateTime::parse_from_rfc3339("2025-10-22T10:18:44.256Z")
-            .unwrap()
-            .timestamp_nanos_opt()
-            .unwrap();
-        assert_eq!(ts_array.value(0), expected);
-
-        // Third one should match the integer
-        assert_eq!(ts_array.value(2), 1729594724256000000_i64);
-    }
-
-    #[test]
-    fn test_encode_invalid_string_timestamp() {
-        // Test that invalid timestamp strings become null
-        let mut log1 = LogEvent::default();
-        log1.insert("timestamp", "not-a-timestamp");
-
-        let mut log2 = LogEvent::default();
-        log2.insert("timestamp", "2025-10-22T10:18:44.256Z"); // Valid
-
-        let mut log3 = LogEvent::default();
-        log3.insert("timestamp", "2025-99-99T99:99:99Z"); // Invalid
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "timestamp",
-            DataType::Timestamp(TimeUnit::Nanosecond, None),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 3);
-
-        let ts_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<TimestampNanosecondArray>()
-            .unwrap();
-
-        // Invalid timestamps should be null
-        assert!(ts_array.is_null(0));
-        assert!(!ts_array.is_null(1)); // Valid one
-        assert!(ts_array.is_null(2));
-    }
-
-    #[test]
-    fn test_encode_decimal128_from_integer() {
-        use arrow::array::Decimal128Array;
-
-        let mut log = LogEvent::default();
-        // Store quantity as integer: 1000
-        log.insert("quantity", 1000_i64);
-
-        let events = vec![Event::Log(log)];
-
-        // Decimal(10, 3) - will represent 1000 as 1000.000
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "quantity",
-            DataType::Decimal128(10, 3),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-
-        let decimal_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Decimal128Array>()
-            .unwrap();
-
-        assert!(!decimal_array.is_null(0));
-        // 1000 with scale 3 = 1000 * 10^3 = 1000000
-        assert_eq!(decimal_array.value(0), 1000000_i128);
-    }
-
-    #[test]
-    fn test_encode_decimal256() {
-        use arrow::array::Decimal256Array;
-
-        let mut log = LogEvent::default();
-        // Very large precision number
-        log.insert("big_value", 123456789.123456_f64);
-
-        let events = vec![Event::Log(log)];
-
-        // Decimal256(50, 6) - high precision decimal
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "big_value",
-            DataType::Decimal256(50, 6),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-
-        let decimal_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Decimal256Array>()
-            .unwrap();
-
-        assert!(!decimal_array.is_null(0));
-        // Value should be non-null and encoded
-        let value = decimal_array.value(0);
-        assert!(value.to_i128().is_some());
-    }
-
-    #[test]
-    fn test_encode_decimal_null_values() {
-        use arrow::array::Decimal128Array;
-
-        let mut log1 = LogEvent::default();
-        log1.insert("price", 99.99_f64);
-
-        let log2 = LogEvent::default();
-        // No price field - should be null
-
-        let mut log3 = LogEvent::default();
-        log3.insert("price", 50.00_f64);
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "price",
-            DataType::Decimal128(10, 2),
-            true,
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 3);
-
-        let decimal_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Decimal128Array>()
-            .unwrap();
-
-        // First row: 99.99
-        assert!(!decimal_array.is_null(0));
-        assert_eq!(decimal_array.value(0), 9999_i128);
-
-        // Second row: null
-        assert!(decimal_array.is_null(1));
-
-        // Third row: 50.00
-        assert!(!decimal_array.is_null(2));
-        assert_eq!(decimal_array.value(2), 5000_i128);
-    }
-
-    #[test]
-    fn test_encode_unsigned_integer_types() {
-        use arrow::array::{UInt8Array, UInt16Array, UInt32Array, UInt64Array};
-
-        let mut log = LogEvent::default();
-        log.insert("uint8_field", 255_i64);
-        log.insert("uint16_field", 65535_i64);
-        log.insert("uint32_field", 4294967295_i64);
-        log.insert("uint64_field", 9223372036854775807_i64);
-
-        let events = vec![Event::Log(log)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("uint8_field", DataType::UInt8, true),
-            Field::new("uint16_field", DataType::UInt16, true),
-            Field::new("uint32_field", DataType::UInt32, true),
-            Field::new("uint64_field", DataType::UInt64, true),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 1);
-        assert_eq!(batch.num_columns(), 4);
-
-        // Verify uint8
-        let uint8_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<UInt8Array>()
-            .unwrap();
-        assert_eq!(uint8_array.value(0), 255_u8);
-
-        // Verify uint16
-        let uint16_array = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<UInt16Array>()
-            .unwrap();
-        assert_eq!(uint16_array.value(0), 65535_u16);
-
-        // Verify uint32
-        let uint32_array = batch
-            .column(2)
-            .as_any()
-            .downcast_ref::<UInt32Array>()
-            .unwrap();
-        assert_eq!(uint32_array.value(0), 4294967295_u32);
-
-        // Verify uint64
-        let uint64_array = batch
-            .column(3)
-            .as_any()
-            .downcast_ref::<UInt64Array>()
-            .unwrap();
-        assert_eq!(uint64_array.value(0), 9223372036854775807_u64);
-    }
-
-    #[test]
-    fn test_encode_unsigned_integers_with_null_and_overflow() {
-        use arrow::array::{UInt8Array, UInt32Array};
-
-        let mut log1 = LogEvent::default();
-        log1.insert("uint8_field", 100_i64);
-        log1.insert("uint32_field", 1000_i64);
-
-        let mut log2 = LogEvent::default();
-        log2.insert("uint8_field", 300_i64); // Overflow - should be null
-        log2.insert("uint32_field", -1_i64); // Negative - should be null
-
-        let log3 = LogEvent::default();
-        // Missing fields - should be null
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("uint8_field", DataType::UInt8, true),
-            Field::new("uint32_field", DataType::UInt32, true),
-        ]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 3);
-
-        // Check uint8 column
-        let uint8_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<UInt8Array>()
-            .unwrap();
-        assert_eq!(uint8_array.value(0), 100_u8); // Valid
-        assert!(uint8_array.is_null(1)); // Overflow
-        assert!(uint8_array.is_null(2)); // Missing
-
-        // Check uint32 column
-        let uint32_array = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<UInt32Array>()
-            .unwrap();
-        assert_eq!(uint32_array.value(0), 1000_u32); // Valid
-        assert!(uint32_array.is_null(1)); // Negative
-        assert!(uint32_array.is_null(2)); // Missing
-    }
-
-    #[test]
-    fn test_encode_non_nullable_field_with_null_value() {
-        // Test that encoding fails when a non-nullable field encounters a null value
-        let mut log1 = LogEvent::default();
-        log1.insert("required_field", 42);
-
-        let log2 = LogEvent::default();
-        // log2 is missing required_field - should cause an error
-
-        let events = vec![Event::Log(log1), Event::Log(log2)];
-
-        // Create schema with non-nullable field
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "required_field",
-            DataType::Int64,
-            false, // Not nullable
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(schema));
-        assert!(result.is_err());
-
-        match result.unwrap_err() {
-            ArrowEncodingError::NullConstraint { field_name } => {
-                assert_eq!(field_name, "required_field");
-            }
-            other => panic!("Expected NullConstraint error, got: {:?}", other),
-        }
-    }
-
-    #[test]
-    fn test_encode_non_nullable_string_field_with_missing_value() {
-        // Test that encoding fails for non-nullable string field
-        let mut log1 = LogEvent::default();
-        log1.insert("name", "Alice");
-
-        let mut log2 = LogEvent::default();
-        log2.insert("name", "Bob");
-
-        let log3 = LogEvent::default();
-        // log3 is missing name field
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "name",
-            DataType::Utf8,
-            false, // Not nullable
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(schema));
-        assert!(result.is_err());
-
-        match result.unwrap_err() {
-            ArrowEncodingError::NullConstraint { field_name } => {
-                assert_eq!(field_name, "name");
-            }
-            other => panic!("Expected NullConstraint error, got: {:?}", other),
-        }
-    }
-
-    #[test]
-    fn test_encode_non_nullable_field_all_values_present() {
-        // Test that encoding succeeds when all values are present for non-nullable field
-        let mut log1 = LogEvent::default();
-        log1.insert("id", 1);
-
-        let mut log2 = LogEvent::default();
-        log2.insert("id", 2);
-
-        let mut log3 = LogEvent::default();
-        log3.insert("id", 3);
-
-        let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
-
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "id",
-            DataType::Int64,
-            false, // Not nullable
-        )]));
-
-        let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-        assert!(result.is_ok());
-
-        let bytes = result.unwrap();
-        let cursor = Cursor::new(bytes);
-        let mut reader = StreamReader::try_new(cursor, None).unwrap();
-        let batch = reader.next().unwrap().unwrap();
-
-        assert_eq!(batch.num_rows(), 3);
-
-        let id_array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-
-        assert_eq!(id_array.value(0), 1);
-        assert_eq!(id_array.value(1), 2);
-        assert_eq!(id_array.value(2), 3);
-        assert!(!id_array.is_null(0));
-        assert!(!id_array.is_null(1));
-        assert!(!id_array.is_null(2));
-    }
-
-    #[test]
-    fn test_config_allow_nullable_fields_overrides_schema() {
-        use tokio_util::codec::Encoder;
-
-        // Create events: One valid, one missing the "required" field
-        let mut log1 = LogEvent::default();
-        log1.insert("strict_field", 42);
-        let log2 = LogEvent::default();
-        let events = vec![Event::Log(log1), Event::Log(log2)];
-
-        let schema = Schema::new(vec![Field::new("strict_field", DataType::Int64, false)]);
-
-        let mut config = ArrowStreamSerializerConfig::new(schema);
-        config.allow_nullable_fields = true;
-
-        let mut serializer =
-            ArrowStreamSerializer::new(config).expect("Failed to create serializer");
-
-        let mut buffer = BytesMut::new();
-        serializer
-            .encode(events, &mut buffer)
-            .expect("Encoding should succeed when allow_nullable_fields is true");
-
-        let cursor = Cursor::new(buffer);
-        let mut reader = StreamReader::try_new(cursor, None).expect("Failed to create reader");
-        let batch = reader.next().unwrap().expect("Failed to read batch");
-
-        assert_eq!(batch.num_rows(), 2);
-
-        let binding = batch.schema();
-        let output_field = binding.field(0);
-        assert!(
-            output_field.is_nullable(),
-            "The output schema field should have been transformed to nullable=true"
-        );
-
-        let array = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-
-        assert_eq!(array.value(0), 42);
-        assert!(!array.is_null(0));
-        assert!(
-            array.is_null(1),
-            "The missing value should be encoded as null"
-        );
-    }
-
-    #[test]
-    fn test_make_field_nullable_with_nested_types() {
-        // Test that make_field_nullable recursively handles List and Struct types
-
-        // Create a nested structure: Struct containing a List of Structs
-        // struct { inner_list: [{ nested_field: Int64 }] }
-        let inner_struct_field = Field::new("nested_field", DataType::Int64, false);
-        let inner_struct =
-            DataType::Struct(arrow::datatypes::Fields::from(vec![inner_struct_field]));
-        let list_field = Field::new("item", inner_struct, false);
-        let list_type = DataType::List(Arc::new(list_field));
-        let outer_field = Field::new("inner_list", list_type, false);
-        let outer_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![outer_field]));
-
-        let original_field = Field::new("root", outer_struct, false);
-
-        // Apply make_field_nullable
-        let nullable_field = make_field_nullable(&original_field);
-
-        // Verify root field is nullable
-        assert!(
-            nullable_field.is_nullable(),
-            "Root field should be nullable"
-        );
-
-        // Verify nested struct is nullable
-        if let DataType::Struct(root_fields) = nullable_field.data_type() {
-            let inner_list_field = &root_fields[0];
-            assert!(
-                inner_list_field.is_nullable(),
-                "inner_list field should be nullable"
-            );
-
-            // Verify list element is nullable
-            if let DataType::List(list_item_field) = inner_list_field.data_type() {
-                assert!(
-                    list_item_field.is_nullable(),
-                    "List item field should be nullable"
-                );
-
-                // Verify inner struct fields are nullable
-                if let DataType::Struct(inner_struct_fields) = list_item_field.data_type() {
-                    let nested_field = &inner_struct_fields[0];
-                    assert!(
-                        nested_field.is_nullable(),
-                        "nested_field should be nullable"
-                    );
-                } else {
-                    panic!("Expected Struct type for list items");
-                }
-            } else {
-                panic!("Expected List type for inner_list");
-            }
-        } else {
-            panic!("Expected Struct type for root field");
-        }
-    }
-
-    #[test]
-    fn test_make_field_nullable_with_map_type() {
-        // Test that make_field_nullable handles Map types
-        // Map is internally represented as List<Struct<key, value>>
-
-        // Create a map: Map<Utf8, Int64>
-        // Internally: List<Struct<key, value>>
-        let key_field = Field::new("key", DataType::Utf8, false);
-        let value_field = Field::new("value", DataType::Int64, false);
-        let entries_struct =
-            DataType::Struct(arrow::datatypes::Fields::from(vec![key_field, value_field]));
-        let entries_field = Field::new("entries", entries_struct, false);
-        let map_type = DataType::Map(Arc::new(entries_field), false);
-
-        let original_field = Field::new("my_map", map_type, false);
-
-        // Apply make_field_nullable
-        let nullable_field = make_field_nullable(&original_field);
-
-        // Verify root field is nullable
-        assert!(
-            nullable_field.is_nullable(),
-            "Root map field should be nullable"
-        );
-
-        // Verify map entries are nullable
-        if let DataType::Map(entries_field, _sorted) = nullable_field.data_type() {
-            assert!(
-                entries_field.is_nullable(),
-                "Map entries field should be nullable"
-            );
-
-            // Verify the struct inside the map is nullable
-            if let DataType::Struct(struct_fields) = entries_field.data_type() {
-                let key_field = &struct_fields[0];
-                let value_field = &struct_fields[1];
-                assert!(key_field.is_nullable(), "Map key field should be nullable");
-                assert!(
-                    value_field.is_nullable(),
-                    "Map value field should be nullable"
-                );
-            } else {
-                panic!("Expected Struct type for map entries");
-            }
-        } else {
-            panic!("Expected Map type for my_map field");
-        }
-    }
-}
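For context while reviewing the split: the sketch below shows how a sink could satisfy `SchemaProvider` and build the serializer from it. It is illustrative only and not part of this patch; the provider name and helper are hypothetical, and it assumes the `Result<Schema, String>` signature of `get_schema` shown above.

    // Hypothetical provider that returns a schema it already holds in memory.
    #[derive(Debug)]
    struct StaticSchemaProvider {
        schema: arrow::datatypes::Schema,
    }

    #[async_trait::async_trait]
    impl SchemaProvider for StaticSchemaProvider {
        async fn get_schema(&self) -> Result<arrow::datatypes::Schema, String> {
            // A real implementation would query the downstream data store here.
            Ok(self.schema.clone())
        }
    }

    // Fetch the schema once at sink build time, then construct the serializer.
    async fn build_serializer(
        provider: &dyn SchemaProvider,
    ) -> Result<ArrowStreamSerializer, vector_common::Error> {
        let schema = provider.get_schema().await.map_err(vector_common::Error::from)?;
        ArrowStreamSerializer::new(ArrowStreamSerializerConfig::new(schema))
    }
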
diff --git a/lib/codecs/src/encoding/format/arrow/builder.rs b/lib/codecs/src/encoding/format/arrow/builder.rs
new file mode 100644
index 0000000000000..1b9af9bb721dc
--- /dev/null
+++ b/lib/codecs/src/encoding/format/arrow/builder.rs
@@ -0,0 +1,66 @@
+use arrow::{
+    array::ArrayRef,
+    datatypes::{DataType, Schema},
+    record_batch::RecordBatch,
+};
+use std::sync::Arc;
+use vector_core::event::Event;
+
+use crate::encoding::format::arrow::{
+    ArrowEncodingError,
+    types::{
+        build_binary_array, build_boolean_array, build_decimal128_array, build_decimal256_array,
+        build_float32_array, build_float64_array, build_int8_array, build_int16_array,
+        build_int32_array, build_int64_array, build_string_array, build_timestamp_array,
+        build_uint8_array, build_uint16_array, build_uint32_array, build_uint64_array,
+    },
+};
+
+/// Builds an Arrow RecordBatch from events
+pub(crate) fn build_record_batch(
+    schema: Arc<Schema>,
+    events: &[Event],
+) -> Result<RecordBatch, ArrowEncodingError> {
+    let num_fields = schema.fields().len();
+    let mut columns: Vec<ArrayRef> = Vec::with_capacity(num_fields);
+
+    for field in schema.fields() {
+        let field_name = field.name();
+        let nullable = field.is_nullable();
+        let array: ArrayRef = match field.data_type() {
+            DataType::Timestamp(time_unit, _) => {
+                build_timestamp_array(events, field_name, *time_unit, nullable)?
+            }
+            DataType::Utf8 => build_string_array(events, field_name, nullable)?,
+            DataType::Int8 => build_int8_array(events, field_name, nullable)?,
+            DataType::Int16 => build_int16_array(events, field_name, nullable)?,
+            DataType::Int32 => build_int32_array(events, field_name, nullable)?,
+            DataType::Int64 => build_int64_array(events, field_name, nullable)?,
+            DataType::UInt8 => build_uint8_array(events, field_name, nullable)?,
+            DataType::UInt16 => build_uint16_array(events, field_name, nullable)?,
+            DataType::UInt32 => build_uint32_array(events, field_name, nullable)?,
+            DataType::UInt64 => build_uint64_array(events, field_name, nullable)?,
+            DataType::Float32 => build_float32_array(events, field_name, nullable)?,
+            DataType::Float64 => build_float64_array(events, field_name, nullable)?,
+            DataType::Boolean => build_boolean_array(events, field_name, nullable)?,
+            DataType::Binary => build_binary_array(events, field_name, nullable)?,
+            DataType::Decimal128(precision, scale) => {
+                build_decimal128_array(events, field_name, *precision, *scale, nullable)?
+            }
+            DataType::Decimal256(precision, scale) => {
+                build_decimal256_array(events, field_name, *precision, *scale, nullable)?
+            }
+            other_type => {
+                return Err(ArrowEncodingError::UnsupportedType {
+                    field_name: field_name.into(),
+                    data_type: other_type.clone(),
+                });
+            }
+        };
+
+        columns.push(array);
+    }
+
+    RecordBatch::try_new(schema, columns)
+        .map_err(|source| ArrowEncodingError::RecordBatchCreation { source })
+}
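The new `builder` module keeps the column-dispatch logic in one place: one column per schema field, one row per event, with the per-type `build_*_array` helpers now living under `types`. A minimal sketch of the expected shape, mirroring the existing tests (illustrative; `build_record_batch` is `pub(crate)`, so this only compiles from inside the crate):

    use std::sync::Arc;
    use arrow::datatypes::{DataType, Field, Schema};
    use vector_core::event::{Event, LogEvent};

    fn shape_example() -> Result<(), ArrowEncodingError> {
        let schema = Arc::new(Schema::new(vec![
            Field::new("message", DataType::Utf8, true),
            Field::new("status", DataType::Int64, true),
        ]));

        let mut log = LogEvent::default();
        log.insert("message", "hello");
        log.insert("status", 200);

        let batch = build_record_batch(schema, &[Event::Log(log)])?;
        assert_eq!(batch.num_columns(), 2); // one column per schema field
        assert_eq!(batch.num_rows(), 1); // one row per event
        Ok(())
    }
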
diff --git a/lib/codecs/src/encoding/format/arrow/mod.rs b/lib/codecs/src/encoding/format/arrow/mod.rs
new file mode 100644
index 0000000000000..4eb300a9406ce
--- /dev/null
+++ b/lib/codecs/src/encoding/format/arrow/mod.rs
@@ -0,0 +1,254 @@
+//! Arrow IPC streaming format codec for batched event encoding
+//!
+//! Provides Apache Arrow IPC stream format encoding with static schema support.
+//! This implements the streaming variant of the Arrow IPC protocol, which writes
+//! a continuous stream of record batches without a file footer.
+
+mod builder;
+mod types;
+
+#[cfg(test)]
+mod tests;
+
+use arrow::{
+    datatypes::{DataType, Schema},
+    ipc::writer::StreamWriter,
+};
+use async_trait::async_trait;
+use bytes::{BufMut, Bytes, BytesMut};
+use snafu::Snafu;
+use std::sync::Arc;
+use vector_config::configurable_component;
+
+use builder::build_record_batch;
+
+/// Provides Arrow schema for encoding.
+///
+/// Sinks can implement this trait to provide custom schema fetching logic.
+#[async_trait]
+pub trait SchemaProvider: Send + Sync + std::fmt::Debug {
+    /// Fetch the Arrow schema from the data store.
+    ///
+    /// This is called during sink configuration build phase to fetch
+    /// the schema once at startup, rather than at runtime.
+    async fn get_schema(&self) -> Result<Schema, String>;
+}
+
+/// Configuration for Arrow IPC stream serialization
+#[configurable_component]
+#[derive(Clone, Default)]
+pub struct ArrowStreamSerializerConfig {
+    /// The Arrow schema to use for encoding
+    #[serde(skip)]
+    #[configurable(derived)]
+    pub schema: Option<Schema>,
+
+    /// Allow null values for non-nullable fields in the schema.
+    ///
+    /// When enabled, missing or incompatible values will be encoded as null even for fields
+    /// marked as non-nullable in the Arrow schema. This is useful when working with downstream
+    /// systems that can handle null values through defaults, computed columns, or other mechanisms.
+    ///
+    /// When disabled (default), missing values for non-nullable fields will cause encoding errors,
+    /// ensuring all required data is present before sending to the sink.
+    #[serde(default)]
+    #[configurable(derived)]
+    pub allow_nullable_fields: bool,
+}
+
+impl std::fmt::Debug for ArrowStreamSerializerConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ArrowStreamSerializerConfig")
+            .field(
+                "schema",
+                &self
+                    .schema
+                    .as_ref()
+                    .map(|s| format!("{} fields", s.fields().len())),
+            )
+            .field("allow_nullable_fields", &self.allow_nullable_fields)
+            .finish()
+    }
+}
+
+impl ArrowStreamSerializerConfig {
+    /// Create a new ArrowStreamSerializerConfig with a schema
+    pub fn new(schema: arrow::datatypes::Schema) -> Self {
+        Self {
+            schema: Some(schema),
+            allow_nullable_fields: false,
+        }
+    }
+
+    /// The data type of events that are accepted by `ArrowStreamEncoder`.
+    pub fn input_type(&self) -> vector_core::config::DataType {
+        vector_core::config::DataType::Log
+    }
+
+    /// The schema required by the serializer.
+    pub fn schema_requirement(&self) -> vector_core::schema::Requirement {
+        vector_core::schema::Requirement::empty()
+    }
+}
+
+/// Arrow IPC stream batch serializer that holds the schema
+#[derive(Clone, Debug)]
+pub struct ArrowStreamSerializer {
+    schema: Arc<Schema>,
+}
+
+impl ArrowStreamSerializer {
+    /// Create a new ArrowStreamSerializer with the given configuration
+    pub fn new(config: ArrowStreamSerializerConfig) -> Result<Self, vector_common::Error> {
+        let schema = config
+            .schema
+            .ok_or_else(|| vector_common::Error::from("Arrow serializer requires a schema."))?;
+
+        // If allow_nullable_fields is enabled, transform the schema once here
+        // instead of on every batch encoding
+        let schema = if config.allow_nullable_fields {
+            Schema::new_with_metadata(
+                schema
+                    .fields()
+                    .iter()
+                    .map(|f| Arc::new(make_field_nullable(f)))
+                    .collect::<Vec<_>>(),
+                schema.metadata().clone(),
+            )
+        } else {
+            schema
+        };
+
+        Ok(Self {
+            schema: Arc::new(schema),
+        })
+    }
+}
+
+impl tokio_util::codec::Encoder<Vec<vector_core::event::Event>> for ArrowStreamSerializer {
+    type Error = ArrowEncodingError;
+
+    fn encode(
+        &mut self,
+        events: Vec<vector_core::event::Event>,
+        buffer: &mut BytesMut,
+    ) -> Result<(), Self::Error> {
+        if events.is_empty() {
+            return Err(ArrowEncodingError::NoEvents);
+        }
+
+        let bytes = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&self.schema)))?;
+
+        buffer.extend_from_slice(&bytes);
+        Ok(())
+    }
+}
+
+/// Errors that can occur during Arrow encoding
+#[derive(Debug, Snafu)]
+pub enum ArrowEncodingError {
+    /// Failed to create Arrow record batch
+    #[snafu(display("Failed to create Arrow record batch: {}", source))]
+    RecordBatchCreation {
+        /// The underlying Arrow error
+        source: arrow::error::ArrowError,
+    },
+
+    /// Failed to write Arrow IPC data
+    #[snafu(display("Failed to write Arrow IPC data: {}", source))]
+    IpcWrite {
+        /// The underlying Arrow error
+        source: arrow::error::ArrowError,
+    },
+
+    /// No events provided for encoding
+    #[snafu(display("No events provided for encoding"))]
+    NoEvents,
+
+    /// Schema must be provided before encoding
+    #[snafu(display("Schema must be provided before encoding"))]
+    NoSchemaProvided,
+
+    /// Failed to fetch schema from provider
+    #[snafu(display("Failed to fetch schema from provider: {}", message))]
+    SchemaFetchError {
+        /// Error message from the provider
+        message: String,
+    },
+
+    /// Unsupported Arrow data type for field
+    #[snafu(display(
+        "Unsupported Arrow data type for field '{}': {:?}",
+        field_name,
+        data_type
+    ))]
+    UnsupportedType {
+        /// The field name
+        field_name: String,
+        /// The unsupported data type
+        data_type: DataType,
+    },
+
+    /// Null value encountered for non-nullable field
+    #[snafu(display("Null value for non-nullable field '{}'", field_name))]
+    NullConstraint {
+        /// The field name
+        field_name: String,
+    },
+
+    /// IO error during encoding
+    #[snafu(display("IO error: {}", source))]
+    Io {
+        /// The underlying IO error
+        source: std::io::Error,
+    },
+}
+
+impl From<std::io::Error> for ArrowEncodingError {
+    fn from(error: std::io::Error) -> Self {
+        Self::Io { source: error }
+    }
+}
+
+/// Encodes a batch of events into Arrow IPC streaming format
+pub fn encode_events_to_arrow_ipc_stream(
+    events: &[vector_core::event::Event],
+    schema: Option<Arc<Schema>>,
+) -> Result<Bytes, ArrowEncodingError> {
+    if events.is_empty() {
+        return Err(ArrowEncodingError::NoEvents);
+    }
+
+    let schema_ref = schema.ok_or(ArrowEncodingError::NoSchemaProvided)?;
+
+    let record_batch = build_record_batch(schema_ref, events)?;
+
+    let ipc_err = |source| ArrowEncodingError::IpcWrite { source };
+
+    let mut buffer = BytesMut::new().writer();
+    let mut writer =
+        StreamWriter::try_new(&mut buffer, record_batch.schema_ref()).map_err(ipc_err)?;
+    writer.write(&record_batch).map_err(ipc_err)?;
+    writer.finish().map_err(ipc_err)?;
+
+    Ok(buffer.into_inner().freeze())
+}
+
+/// Recursively makes a Field and all its nested fields nullable
+pub(crate) fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Field {
+    let new_data_type = match field.data_type() {
+        DataType::List(inner_field) => DataType::List(Arc::new(make_field_nullable(inner_field))),
+        DataType::Struct(fields) => {
+            DataType::Struct(fields.iter().map(|f| make_field_nullable(f)).collect())
+        }
+        DataType::Map(inner_field, sorted) => {
+            DataType::Map(Arc::new(make_field_nullable(inner_field)), *sorted)
+        }
+        other => other.clone(),
+    };
+
+    field
+        .clone()
+        .with_data_type(new_data_type)
+        .with_nullable(true)
+}
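`encode_events_to_arrow_ipc_stream` stays the public entry point after the split. A short round-trip sketch (illustrative, mirroring the pattern the tests below use): the output is a self-contained IPC stream carrying its own schema, one record batch, and an end-of-stream marker, so any Arrow IPC stream reader can consume it.

    use std::{io::Cursor, sync::Arc};
    use arrow::{
        datatypes::{DataType, Field, Schema},
        ipc::reader::StreamReader,
    };
    use vector_core::event::{Event, LogEvent};

    fn roundtrip_example() {
        let schema = Arc::new(Schema::new(vec![Field::new("message", DataType::Utf8, true)]));
        let mut log = LogEvent::default();
        log.insert("message", "hello");

        let bytes = encode_events_to_arrow_ipc_stream(&[Event::Log(log)], Some(schema)).unwrap();

        // Read the stream back with the stock Arrow reader.
        let mut reader = StreamReader::try_new(Cursor::new(bytes), None).unwrap();
        let batch = reader.next().unwrap().unwrap();
        assert_eq!(batch.num_rows(), 1);
    }
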
Field::new("float32_field", DataType::Float32, true), + Field::new("float64_field", DataType::Float64, true), + Field::new("bool_field", DataType::Boolean, true), + Field::new("bytes_field", DataType::Binary, true), + Field::new( + "timestamp_field", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + ])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 10); + + // Verify string field + assert_eq!( + batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + "test" + ); + + // Verify int8 field + assert_eq!( + batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 127 + ); + + // Verify int16 field + assert_eq!( + batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 32000 + ); + + // Verify int32 field + assert_eq!( + batch + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 1000000 + ); + + // Verify int64 field + assert_eq!( + batch + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 42 + ); + + // Verify float32 field + assert!( + (batch + .column(5) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + - 3.15) + .abs() + < 0.001 + ); + + // Verify float64 field + assert!( + (batch + .column(6) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + - 3.15) + .abs() + < 0.001 + ); + + // Verify boolean field + assert!( + batch + .column(7) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + "{}", + true + ); + + // Verify binary field + assert_eq!( + batch + .column(8) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + b"binary" + ); + + // Verify timestamp field + assert!( + !batch + .column(9) + .as_any() + .downcast_ref::() + .unwrap() + .is_null(0) + ); +} + +#[test] +fn test_encode_null_values() { + let mut log1 = LogEvent::default(); + log1.insert("field_a", 1); + // field_b is missing + + let mut log2 = LogEvent::default(); + log2.insert("field_b", 2); + // field_a is missing + + let events = vec![Event::Log(log1), Event::Log(log2)]; + + let schema = Arc::new(Schema::new(vec![ + Field::new("field_a", DataType::Int64, true), + Field::new("field_b", DataType::Int64, true), + ])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 2); + + let field_a = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(field_a.value(0), 1); + assert!(field_a.is_null(1)); + + let field_b = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(field_b.is_null(0)); + assert_eq!(field_b.value(1), 2); +} + +#[test] +fn test_encode_type_mismatches() { + let mut log1 = LogEvent::default(); + log1.insert("field", 42); // Integer + + let mut log2 = LogEvent::default(); + log2.insert("field", 3.15); // Float - type mismatch! 
+ + let events = vec![Event::Log(log1), Event::Log(log2)]; + + // Schema expects Int64 + let schema = Arc::new(Schema::new(vec![Field::new( + "field", + DataType::Int64, + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 2); + + let field_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(field_array.value(0), 42); + assert!(field_array.is_null(1)); // Type mismatch becomes null +} + +#[test] +fn test_encode_complex_json_values() { + use serde_json::json; + + let mut log = LogEvent::default(); + log.insert( + "object_field", + json!({"key": "value", "nested": {"count": 42}}), + ); + log.insert("array_field", json!([1, 2, 3])); + + let events = vec![Event::Log(log)]; + + let schema = Arc::new(Schema::new(vec![ + Field::new("object_field", DataType::Utf8, true), + Field::new("array_field", DataType::Utf8, true), + ])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + + let object_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let object_str = object_array.value(0); + assert!(object_str.contains("key")); + assert!(object_str.contains("value")); + + let array_array = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + let array_str = array_array.value(0); + assert_eq!(array_str, "[1,2,3]"); +} + +#[test] +fn test_encode_unsupported_type() { + let mut log = LogEvent::default(); + log.insert("field", "value"); + + let events = vec![Event::Log(log)]; + + // Use an unsupported type + let schema = Arc::new(Schema::new(vec![Field::new( + "field", + DataType::Duration(TimeUnit::Millisecond), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(schema)); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + ArrowEncodingError::UnsupportedType { .. 
} + )); +} + +#[test] +fn test_encode_without_schema_fails() { + let mut log1 = LogEvent::default(); + log1.insert("message", "hello"); + + let events = vec![Event::Log(log1)]; + + let result = encode_events_to_arrow_ipc_stream(&events, None); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + ArrowEncodingError::NoSchemaProvided + )); +} + +#[test] +fn test_encode_empty_events() { + let events: Vec = vec![]; + let result = encode_events_to_arrow_ipc_stream(&events, None); + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), ArrowEncodingError::NoEvents)); +} + +#[test] +fn test_encode_timestamp_precisions() { + let now = Utc::now(); + let mut log = LogEvent::default(); + log.insert("ts_second", now); + log.insert("ts_milli", now); + log.insert("ts_micro", now); + log.insert("ts_nano", now); + + let events = vec![Event::Log(log)]; + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "ts_second", + DataType::Timestamp(TimeUnit::Second, None), + true, + ), + Field::new( + "ts_milli", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new( + "ts_micro", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ), + Field::new( + "ts_nano", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + ])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 4); + + let ts_second = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!ts_second.is_null(0)); + assert_eq!(ts_second.value(0), now.timestamp()); + + let ts_milli = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!ts_milli.is_null(0)); + assert_eq!(ts_milli.value(0), now.timestamp_millis()); + + let ts_micro = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!ts_micro.is_null(0)); + assert_eq!(ts_micro.value(0), now.timestamp_micros()); + + let ts_nano = batch + .column(3) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!ts_nano.is_null(0)); + assert_eq!(ts_nano.value(0), now.timestamp_nanos_opt().unwrap()); +} + +#[test] +fn test_encode_mixed_timestamp_string_and_native() { + // Test mixing string timestamps with native Timestamp values + let mut log1 = LogEvent::default(); + log1.insert("ts", "2025-10-22T10:18:44.256Z"); // String + + let mut log2 = LogEvent::default(); + log2.insert("ts", Utc::now()); // Native Timestamp + + let mut log3 = LogEvent::default(); + log3.insert("ts", 1729594724256000000_i64); // Integer (nanoseconds) + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 3); + + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // All three should be non-null + assert!(!ts_array.is_null(0)); + assert!(!ts_array.is_null(1)); + assert!(!ts_array.is_null(2)); 
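+
+    // (Row 1 comes from `Utc::now()`, so only its non-nullness is asserted;
+    // rows 0 and 2 have deterministic expected values, checked below.)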
+ + // First one should match the parsed string + let expected = chrono::DateTime::parse_from_rfc3339("2025-10-22T10:18:44.256Z") + .unwrap() + .timestamp_nanos_opt() + .unwrap(); + assert_eq!(ts_array.value(0), expected); + + // Third one should match the integer + assert_eq!(ts_array.value(2), 1729594724256000000_i64); +} + +#[test] +fn test_encode_invalid_string_timestamp() { + // Test that invalid timestamp strings become null + let mut log1 = LogEvent::default(); + log1.insert("timestamp", "not-a-timestamp"); + + let mut log2 = LogEvent::default(); + log2.insert("timestamp", "2025-10-22T10:18:44.256Z"); // Valid + + let mut log3 = LogEvent::default(); + log3.insert("timestamp", "2025-99-99T99:99:99Z"); // Invalid + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = Arc::new(Schema::new(vec![Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 3); + + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // Invalid timestamps should be null + assert!(ts_array.is_null(0)); + assert!(!ts_array.is_null(1)); // Valid one + assert!(ts_array.is_null(2)); +} + +#[test] +fn test_encode_decimal128_from_integer() { + let mut log = LogEvent::default(); + // Store quantity as integer: 1000 + log.insert("quantity", 1000_i64); + + let events = vec![Event::Log(log)]; + + // Decimal(10, 3) - will represent 1000 as 1000.000 + let schema = Arc::new(Schema::new(vec![Field::new( + "quantity", + DataType::Decimal128(10, 3), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + + let decimal_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert!(!decimal_array.is_null(0)); + // 1000 with scale 3 = 1000 * 10^3 = 1000000 + assert_eq!(decimal_array.value(0), 1000000_i128); +} + +#[test] +fn test_encode_decimal256() { + let mut log = LogEvent::default(); + // Very large precision number + log.insert("big_value", 123456789.123456_f64); + + let events = vec![Event::Log(log)]; + + // Decimal256(50, 6) - high precision decimal + let schema = Arc::new(Schema::new(vec![Field::new( + "big_value", + DataType::Decimal256(50, 6), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + + let decimal_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert!(!decimal_array.is_null(0)); + // Value should be non-null and encoded + let value = decimal_array.value(0); + assert!(value.to_i128().is_some()); +} + +#[test] +fn test_encode_decimal_null_values() { + let mut log1 = LogEvent::default(); + log1.insert("price", 99.99_f64); + + let log2 = LogEvent::default(); 
+ // No price field - should be null + + let mut log3 = LogEvent::default(); + log3.insert("price", 50.00_f64); + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = Arc::new(Schema::new(vec![Field::new( + "price", + DataType::Decimal128(10, 2), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 3); + + let decimal_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // First row: 99.99 + assert!(!decimal_array.is_null(0)); + assert_eq!(decimal_array.value(0), 9999_i128); + + // Second row: null + assert!(decimal_array.is_null(1)); + + // Third row: 50.00 + assert!(!decimal_array.is_null(2)); + assert_eq!(decimal_array.value(2), 5000_i128); +} + +#[test] +fn test_encode_unsigned_integer_types() { + let mut log = LogEvent::default(); + log.insert("uint8_field", 255_i64); + log.insert("uint16_field", 65535_i64); + log.insert("uint32_field", 4294967295_i64); + log.insert("uint64_field", 9223372036854775807_i64); + + let events = vec![Event::Log(log)]; + + let schema = Arc::new(Schema::new(vec![ + Field::new("uint8_field", DataType::UInt8, true), + Field::new("uint16_field", DataType::UInt16, true), + Field::new("uint32_field", DataType::UInt32, true), + Field::new("uint64_field", DataType::UInt64, true), + ])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 4); + + // Verify uint8 + let uint8_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint8_array.value(0), 255_u8); + + // Verify uint16 + let uint16_array = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint16_array.value(0), 65535_u16); + + // Verify uint32 + let uint32_array = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint32_array.value(0), 4294967295_u32); + + // Verify uint64 + let uint64_array = batch + .column(3) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint64_array.value(0), 9223372036854775807_u64); +} + +#[test] +fn test_encode_unsigned_integers_with_null_and_overflow() { + let mut log1 = LogEvent::default(); + log1.insert("uint8_field", 100_i64); + log1.insert("uint32_field", 1000_i64); + + let mut log2 = LogEvent::default(); + log2.insert("uint8_field", 300_i64); // Overflow - should be null + log2.insert("uint32_field", -1_i64); // Negative - should be null + + let log3 = LogEvent::default(); + // Missing fields - should be null + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = Arc::new(Schema::new(vec![ + Field::new("uint8_field", DataType::UInt8, true), + Field::new("uint32_field", DataType::UInt32, true), + ])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); 
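+
+    // Expected rows: 0 is in range; 1 overflows u8 (300) and is negative for
+    // u32 (-1); 2 is missing both fields, so every column yields null.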
+ + assert_eq!(batch.num_rows(), 3); + + // Check uint8 column + let uint8_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint8_array.value(0), 100_u8); // Valid + assert!(uint8_array.is_null(1)); // Overflow + assert!(uint8_array.is_null(2)); // Missing + + // Check uint32 column + let uint32_array = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint32_array.value(0), 1000_u32); // Valid + assert!(uint32_array.is_null(1)); // Negative + assert!(uint32_array.is_null(2)); // Missing +} + +#[test] +fn test_encode_non_nullable_field_with_null_value() { + // Test that encoding fails when a non-nullable field encounters a null value + let mut log1 = LogEvent::default(); + log1.insert("required_field", 42); + + let log2 = LogEvent::default(); + // log2 is missing required_field - should cause an error + + let events = vec![Event::Log(log1), Event::Log(log2)]; + + // Create schema with non-nullable field + let schema = Arc::new(Schema::new(vec![Field::new( + "required_field", + DataType::Int64, + false, // Not nullable + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(schema)); + assert!(result.is_err()); + + match result.unwrap_err() { + ArrowEncodingError::NullConstraint { field_name } => { + assert_eq!(field_name, "required_field"); + } + other => panic!("Expected NullConstraint error, got: {:?}", other), + } +} + +#[test] +fn test_encode_non_nullable_string_field_with_missing_value() { + // Test that encoding fails for non-nullable string field + let mut log1 = LogEvent::default(); + log1.insert("name", "Alice"); + + let mut log2 = LogEvent::default(); + log2.insert("name", "Bob"); + + let log3 = LogEvent::default(); + // log3 is missing name field + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = Arc::new(Schema::new(vec![Field::new( + "name", + DataType::Utf8, + false, // Not nullable + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(schema)); + assert!(result.is_err()); + + match result.unwrap_err() { + ArrowEncodingError::NullConstraint { field_name } => { + assert_eq!(field_name, "name"); + } + other => panic!("Expected NullConstraint error, got: {:?}", other), + } +} + +#[test] +fn test_encode_non_nullable_field_all_values_present() { + // Test that encoding succeeds when all values are present for non-nullable field + let mut log1 = LogEvent::default(); + log1.insert("id", 1); + + let mut log2 = LogEvent::default(); + log2.insert("id", 2); + + let mut log3 = LogEvent::default(); + log3.insert("id", 3); + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let schema = Arc::new(Schema::new(vec![Field::new( + "id", + DataType::Int64, + false, // Not nullable + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 3); + + let id_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(id_array.value(0), 1); + assert_eq!(id_array.value(1), 2); + assert_eq!(id_array.value(2), 3); + assert!(!id_array.is_null(0)); + assert!(!id_array.is_null(1)); + assert!(!id_array.is_null(2)); +} + +#[test] +fn test_config_allow_nullable_fields_overrides_schema() { + // Create events: One valid, one missing the 
"required" field + let mut log1 = LogEvent::default(); + log1.insert("strict_field", 42); + let log2 = LogEvent::default(); + let events = vec![Event::Log(log1), Event::Log(log2)]; + + let schema = Schema::new(vec![Field::new("strict_field", DataType::Int64, false)]); + + let mut config = ArrowStreamSerializerConfig::new(schema); + config.allow_nullable_fields = true; + + let mut serializer = ArrowStreamSerializer::new(config).expect("Failed to create serializer"); + + let mut buffer = BytesMut::new(); + serializer + .encode(events, &mut buffer) + .expect("Encoding should succeed when allow_nullable_fields is true"); + + let cursor = Cursor::new(buffer); + let mut reader = StreamReader::try_new(cursor, None).expect("Failed to create reader"); + let batch = reader.next().unwrap().expect("Failed to read batch"); + + assert_eq!(batch.num_rows(), 2); + + let binding = batch.schema(); + let output_field = binding.field(0); + assert!( + output_field.is_nullable(), + "The output schema field should have been transformed to nullable=true" + ); + + let array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(array.value(0), 42); + assert!(!array.is_null(0)); + assert!( + array.is_null(1), + "The missing value should be encoded as null" + ); +} + +#[test] +fn test_make_field_nullable_with_nested_types() { + use crate::encoding::format::arrow::make_field_nullable; + + // Test that make_field_nullable recursively handles List and Struct types + + // Create a nested structure: Struct containing a List of Structs + // struct { inner_list: [{ nested_field: Int64 }] } + let inner_struct_field = Field::new("nested_field", DataType::Int64, false); + let inner_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![inner_struct_field])); + let list_field = Field::new("item", inner_struct, false); + let list_type = DataType::List(Arc::new(list_field)); + let outer_field = Field::new("inner_list", list_type, false); + let outer_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![outer_field])); + + let original_field = Field::new("root", outer_struct, false); + + // Apply make_field_nullable + let nullable_field = make_field_nullable(&original_field); + + // Verify root field is nullable + assert!( + nullable_field.is_nullable(), + "Root field should be nullable" + ); + + // Verify nested struct is nullable + if let DataType::Struct(root_fields) = nullable_field.data_type() { + let inner_list_field = &root_fields[0]; + assert!( + inner_list_field.is_nullable(), + "inner_list field should be nullable" + ); + + // Verify list element is nullable + if let DataType::List(list_item_field) = inner_list_field.data_type() { + assert!( + list_item_field.is_nullable(), + "List item field should be nullable" + ); + + // Verify inner struct fields are nullable + if let DataType::Struct(inner_struct_fields) = list_item_field.data_type() { + let nested_field = &inner_struct_fields[0]; + assert!( + nested_field.is_nullable(), + "nested_field should be nullable" + ); + } else { + panic!("Expected Struct type for list items"); + } + } else { + panic!("Expected List type for inner_list"); + } + } else { + panic!("Expected Struct type for root field"); + } +} + +#[test] +fn test_make_field_nullable_with_map_type() { + use crate::encoding::format::arrow::make_field_nullable; + + // Test that make_field_nullable handles Map types + // Map is internally represented as List> + + // Create a map: Map + // Internally: List> + let key_field = Field::new("key", DataType::Utf8, false); + let 
value_field = Field::new("value", DataType::Int64, false); + let entries_struct = + DataType::Struct(arrow::datatypes::Fields::from(vec![key_field, value_field])); + let entries_field = Field::new("entries", entries_struct, false); + let map_type = DataType::Map(Arc::new(entries_field), false); + + let original_field = Field::new("my_map", map_type, false); + + // Apply make_field_nullable + let nullable_field = make_field_nullable(&original_field); + + // Verify root field is nullable + assert!( + nullable_field.is_nullable(), + "Root map field should be nullable" + ); + + // Verify map entries are nullable + if let DataType::Map(entries_field, _sorted) = nullable_field.data_type() { + assert!( + entries_field.is_nullable(), + "Map entries field should be nullable" + ); + + // Verify the struct inside the map is nullable + if let DataType::Struct(struct_fields) = entries_field.data_type() { + let key_field = &struct_fields[0]; + let value_field = &struct_fields[1]; + assert!(key_field.is_nullable(), "Map key field should be nullable"); + assert!( + value_field.is_nullable(), + "Map value field should be nullable" + ); + } else { + panic!("Expected Struct type for map entries"); + } + } else { + panic!("Expected Map type for my_map field"); + } +} diff --git a/lib/codecs/src/encoding/format/arrow/types/decimal.rs b/lib/codecs/src/encoding/format/arrow/types/decimal.rs new file mode 100644 index 0000000000000..a39fd32d12dc7 --- /dev/null +++ b/lib/codecs/src/encoding/format/arrow/types/decimal.rs @@ -0,0 +1,116 @@ +use arrow::{ + array::{ArrayRef, Decimal128Builder, Decimal256Builder}, + datatypes::{DataType, i256}, +}; +use rust_decimal::Decimal; +use std::sync::Arc; +use vector_core::event::{Event, Value}; + +use crate::encoding::format::arrow::ArrowEncodingError; + +/// Macro to handle appending null or returning an error for non-nullable fields. +macro_rules! 
handle_null_constraints {
+    ($builder:expr, $nullable:expr, $field_name:expr) => {{
+        if !$nullable {
+            return Err(ArrowEncodingError::NullConstraint {
+                field_name: $field_name.into(),
+            });
+        }
+        $builder.append_null();
+    }};
+}
+
+pub(crate) fn build_decimal128_array(
+    events: &[Event],
+    field_name: &str,
+    precision: u8,
+    scale: i8,
+    nullable: bool,
+) -> Result<ArrayRef, ArrowEncodingError> {
+    let mut builder = Decimal128Builder::with_capacity(events.len())
+        .with_precision_and_scale(precision, scale)
+        .map_err(|_| ArrowEncodingError::UnsupportedType {
+            field_name: field_name.into(),
+            data_type: DataType::Decimal128(precision, scale),
+        })?;
+
+    let target_scale = scale.unsigned_abs() as u32;
+
+    for event in events {
+        if let Event::Log(log) = event {
+            let mut appended = false;
+            match log.get(field_name) {
+                Some(Value::Float(f)) => {
+                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
+                        decimal.rescale(target_scale);
+                        let mantissa = decimal.mantissa();
+                        builder.append_value(mantissa);
+                        appended = true;
+                    }
+                }
+                Some(Value::Integer(i)) => {
+                    let mut decimal = Decimal::from(*i);
+                    decimal.rescale(target_scale);
+                    let mantissa = decimal.mantissa();
+                    builder.append_value(mantissa);
+                    appended = true;
+                }
+                _ => {}
+            }
+
+            if !appended {
+                handle_null_constraints!(builder, nullable, field_name);
+            }
+        }
+    }
+
+    Ok(Arc::new(builder.finish()))
+}
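+
+// Worked example for the rescaling above: with `Decimal128(10, 2)`, the float
+// 99.99 is rescaled to two fractional digits and stored as its mantissa,
+// 9999_i128; with scale 3, the integer 1000 is stored as 1_000_000_i128
+// (i.e. 1000.000). The unit tests assert exactly these raw values.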
+
+pub(crate) fn build_decimal256_array(
+    events: &[Event],
+    field_name: &str,
+    precision: u8,
+    scale: i8,
+    nullable: bool,
+) -> Result<ArrayRef, ArrowEncodingError> {
+    let mut builder = Decimal256Builder::with_capacity(events.len())
+        .with_precision_and_scale(precision, scale)
+        .map_err(|_| ArrowEncodingError::UnsupportedType {
+            field_name: field_name.into(),
+            data_type: DataType::Decimal256(precision, scale),
+        })?;
+
+    let target_scale = scale.unsigned_abs() as u32;
+
+    for event in events {
+        if let Event::Log(log) = event {
+            let mut appended = false;
+            match log.get(field_name) {
+                Some(Value::Float(f)) => {
+                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
+                        decimal.rescale(target_scale);
+                        let mantissa = decimal.mantissa();
+                        // rust_decimal does not support i256 natively so we upcast here
+                        builder.append_value(i256::from_i128(mantissa));
+                        appended = true;
+                    }
+                }
+                Some(Value::Integer(i)) => {
+                    let mut decimal = Decimal::from(*i);
+                    decimal.rescale(target_scale);
+                    let mantissa = decimal.mantissa();
+                    builder.append_value(i256::from_i128(mantissa));
+                    appended = true;
+                }
+                _ => {}
+            }
+
+            if !appended {
+                handle_null_constraints!(builder, nullable, field_name);
+            }
+        }
+    }
+
+    Ok(Arc::new(builder.finish()))
+}
diff --git a/lib/codecs/src/encoding/format/arrow/types/mod.rs b/lib/codecs/src/encoding/format/arrow/types/mod.rs
new file mode 100644
index 0000000000000..f55c958f6d740
--- /dev/null
+++ b/lib/codecs/src/encoding/format/arrow/types/mod.rs
@@ -0,0 +1,11 @@
+mod decimal;
+mod primitives;
+mod temporal;
+
+pub(crate) use decimal::{build_decimal128_array, build_decimal256_array};
+pub(crate) use primitives::{
+    build_binary_array, build_boolean_array, build_float32_array, build_float64_array,
+    build_int8_array, build_int16_array, build_int32_array, build_int64_array, build_string_array,
+    build_uint8_array, build_uint16_array, build_uint32_array, build_uint64_array,
+};
+pub(crate) use temporal::build_timestamp_array;
diff --git a/lib/codecs/src/encoding/format/arrow/types/primitives.rs b/lib/codecs/src/encoding/format/arrow/types/primitives.rs
new file mode 100644
index 0000000000000..8978c014d1855
--- /dev/null
+++ b/lib/codecs/src/encoding/format/arrow/types/primitives.rs
@@ -0,0 +1,187 @@
+use arrow::array::{
+    ArrayRef, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder, Int8Builder,
+    Int16Builder, Int32Builder, Int64Builder, StringBuilder, UInt8Builder, UInt16Builder,
+    UInt32Builder, UInt64Builder,
+};
+use std::sync::Arc;
+use vector_core::event::{Event, Value};
+
+use crate::encoding::format::arrow::ArrowEncodingError;
+
+/// Macro to handle appending null or returning an error for non-nullable fields.
+macro_rules! handle_null_constraints {
+    ($builder:expr, $nullable:expr, $field_name:expr) => {{
+        if !$nullable {
+            return Err(ArrowEncodingError::NullConstraint {
+                field_name: $field_name.into(),
+            });
+        }
+        $builder.append_null();
+    }};
+}
+
+/// Macro to generate a `build_*_array` function for primitive types.
+macro_rules! define_build_primitive_array_fn {
+    (
+        $fn_name:ident,    // The function name (e.g., build_int8_array)
+        $builder_ty:ty,    // The builder type (e.g., Int8Builder)
+        // One or more match arms for valid Value types
+        $( $value_pat:pat $(if $guard:expr)? => $append_expr:expr ),+
+    ) => {
+        pub(crate) fn $fn_name(
+            events: &[Event],
+            field_name: &str,
+            nullable: bool,
+        ) -> Result<ArrayRef, ArrowEncodingError> {
+            let mut builder = <$builder_ty>::with_capacity(events.len());
+
+            for event in events {
+                if let Event::Log(log) = event {
+                    match log.get(field_name) {
+                        $(
+                            $value_pat $(if $guard)? => builder.append_value($append_expr),
+                        )+
+                        // All other patterns are treated as null/invalid
+                        _ => handle_null_constraints!(builder, nullable, field_name),
+                    }
+                }
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+    };
+}
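+
+// For example, the `build_int8_array` instantiation further below expands to a
+// function that appends `*i as i8` only for in-range `Value::Integer`s; every
+// other value is routed through `handle_null_constraints!`, which appends a
+// null for nullable fields and returns `NullConstraint` otherwise.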
+
+pub(crate) fn build_string_array(
+    events: &[Event],
+    field_name: &str,
+    nullable: bool,
+) -> Result<ArrayRef, ArrowEncodingError> {
+    let mut builder = StringBuilder::with_capacity(events.len(), 0);
+
+    for event in events {
+        if let Event::Log(log) = event {
+            let mut appended = false;
+            if let Some(value) = log.get(field_name) {
+                match value {
+                    Value::Bytes(bytes) => {
+                        // Attempt direct UTF-8 conversion first, fallback to lossy
+                        match std::str::from_utf8(bytes) {
+                            Ok(s) => builder.append_value(s),
+                            Err(_) => builder.append_value(&String::from_utf8_lossy(bytes)),
+                        }
+                        appended = true;
+                    }
+                    Value::Object(obj) => {
+                        if let Ok(s) = serde_json::to_string(&obj) {
+                            builder.append_value(s);
+                            appended = true;
+                        }
+                    }
+                    Value::Array(arr) => {
+                        if let Ok(s) = serde_json::to_string(&arr) {
+                            builder.append_value(s);
+                            appended = true;
+                        }
+                    }
+                    _ => {
+                        builder.append_value(&value.to_string_lossy());
+                        appended = true;
+                    }
+                }
+            }
+
+            if !appended {
+                handle_null_constraints!(builder, nullable, field_name);
+            }
+        }
+    }
+
+    Ok(Arc::new(builder.finish()))
+}
+
+define_build_primitive_array_fn!(
+    build_int8_array,
+    Int8Builder,
+    Some(Value::Integer(i)) if *i >= i8::MIN as i64 && *i <= i8::MAX as i64 => *i as i8
+);
+
+define_build_primitive_array_fn!(
+    build_int16_array,
+    Int16Builder,
+    Some(Value::Integer(i)) if *i >= i16::MIN as i64 && *i <= i16::MAX as i64 => *i as i16
+);
+
+define_build_primitive_array_fn!(
+    build_int32_array,
+    Int32Builder,
+    Some(Value::Integer(i)) if *i >= i32::MIN as i64 && *i <= i32::MAX as i64 => *i as i32
+);
+
+define_build_primitive_array_fn!(
+    build_int64_array,
+    Int64Builder,
+    Some(Value::Integer(i)) => *i
+);
+
+define_build_primitive_array_fn!(
+    build_uint8_array,
+    UInt8Builder,
+    Some(Value::Integer(i)) if *i >= 0 && *i <= u8::MAX as i64 => *i as u8
+);
+
+define_build_primitive_array_fn!(
+    build_uint16_array,
+    UInt16Builder,
+    Some(Value::Integer(i)) if *i >= 0 && *i <= u16::MAX as i64 => *i as u16
+);
+
+define_build_primitive_array_fn!(
+    build_uint32_array,
+    UInt32Builder,
+    Some(Value::Integer(i)) if *i >= 0 && *i <= u32::MAX as i64 => *i as u32
+);
+
+define_build_primitive_array_fn!(
+    build_uint64_array,
+    UInt64Builder,
+    Some(Value::Integer(i)) if *i >= 0 => *i as u64
+);
+
+define_build_primitive_array_fn!(
+    build_float32_array,
+    Float32Builder,
+    Some(Value::Float(f)) => f.into_inner() as f32,
+    Some(Value::Integer(i)) => *i as f32
+);
+
+define_build_primitive_array_fn!(
+    build_float64_array,
+    Float64Builder,
+    Some(Value::Float(f)) => f.into_inner(),
+    Some(Value::Integer(i)) => *i as f64
+);
+
+define_build_primitive_array_fn!(
+    build_boolean_array,
+    BooleanBuilder,
+    Some(Value::Boolean(b)) => *b
+);
+
+pub(crate) fn build_binary_array(
+    events: &[Event],
+    field_name: &str,
+    nullable: bool,
+) -> Result<ArrayRef, ArrowEncodingError> {
+    let mut builder = BinaryBuilder::with_capacity(events.len(), 0);
+
+    for event in events {
+        if let Event::Log(log) = event {
+            match log.get(field_name) {
+                Some(Value::Bytes(bytes)) => builder.append_value(bytes),
+                _ => handle_null_constraints!(builder, nullable, field_name),
+            }
+        }
+    }
+
+    Ok(Arc::new(builder.finish()))
+}
diff --git a/lib/codecs/src/encoding/format/arrow/types/temporal.rs b/lib/codecs/src/encoding/format/arrow/types/temporal.rs
new file mode 100644
index 0000000000000..72e5578dd27cb
--- /dev/null
+++ b/lib/codecs/src/encoding/format/arrow/types/temporal.rs
@@ -0,0 +1,85 @@
+use arrow::{
+    array::{
+        ArrayRef, TimestampMicrosecondBuilder, TimestampMillisecondBuilder,
+        TimestampNanosecondBuilder, TimestampSecondBuilder,
+    },
+    datatypes::TimeUnit,
+};
+use chrono::{DateTime, Utc};
+use std::sync::Arc;
+use vector_core::event::{Event, Value};
+
+use crate::encoding::format::arrow::ArrowEncodingError;
+
+pub(crate) fn extract_timestamp(value: &Value) -> Option<DateTime<Utc>> {
+    match value {
+        Value::Timestamp(ts) => Some(*ts),
+        Value::Bytes(bytes) => std::str::from_utf8(bytes)
+            .ok()
+            .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok())
+            .map(|dt| dt.with_timezone(&Utc)),
+        _ => None,
+    }
+}
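+
+// `extract_timestamp` accepts a native `Value::Timestamp` or an RFC 3339
+// string such as "2025-10-22T10:18:44.256Z"; anything else yields `None`.
+// Raw integer values are handled separately by `build_timestamp_array` below.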
+
+pub(crate) fn build_timestamp_array(
+    events: &[Event],
+    field_name: &str,
+    time_unit: TimeUnit,
+    nullable: bool,
+) -> Result<ArrayRef, ArrowEncodingError> {
+    macro_rules! build_array {
+        ($builder:ty, $converter:expr) => {{
+            let mut builder = <$builder>::with_capacity(events.len());
+            for event in events {
+                if let Event::Log(log) = event {
+                    let value_to_append = log.get(field_name).and_then(|value| {
+                        // First, try to extract it as a native or string timestamp
+                        if let Some(ts) = extract_timestamp(value) {
+                            $converter(&ts)
+                        }
+                        // Else, fall back to a raw integer
+                        else if let Value::Integer(i) = value {
+                            Some(*i)
+                        }
+                        // Else, it's an unsupported type (e.g., Bool, Float)
+                        else {
+                            None
+                        }
+                    });
+
+                    if value_to_append.is_none() && !nullable {
+                        return Err(ArrowEncodingError::NullConstraint {
+                            field_name: field_name.into(),
+                        });
+                    }
+
+                    builder.append_option(value_to_append);
+                }
+            }
+            Ok(Arc::new(builder.finish()))
+        }};
+    }
+
+    match time_unit {
+        TimeUnit::Second => {
+            build_array!(TimestampSecondBuilder, |ts: &DateTime<Utc>| Some(
+                ts.timestamp()
+            ))
+        }
+        TimeUnit::Millisecond => {
+            build_array!(TimestampMillisecondBuilder, |ts: &DateTime<Utc>| Some(
+                ts.timestamp_millis()
+            ))
+        }
+        TimeUnit::Microsecond => {
+            build_array!(TimestampMicrosecondBuilder, |ts: &DateTime<Utc>| Some(
+                ts.timestamp_micros()
+            ))
+        }
+        TimeUnit::Nanosecond => {
+            build_array!(TimestampNanosecondBuilder, |ts: &DateTime<Utc>| ts
+                .timestamp_nanos_opt())
+        }
+    }
+}

From e68270fe2b9c4661101cabb61cc8226ac5ca1905 Mon Sep 17 00:00:00 2001
From: benjamin-awd
Date: Tue, 23 Dec 2025 19:01:43 +0800
Subject: [PATCH 02/11] enhancement(clickhouse sink): add support for complex types

---
 .../src/encoding/format/arrow/builder.rs      |   12 +-
 lib/codecs/src/encoding/format/arrow/mod.rs   |   22 +-
 lib/codecs/src/encoding/format/arrow/tests.rs | 1421 +++++++++++++++--
 .../encoding/format/arrow/types/complex.rs    |  816 ++++++++++
 .../src/encoding/format/arrow/types/mod.rs    |   76 +
 src/sinks/clickhouse/arrow/parser.rs          |  517 ++++--
 src/sinks/clickhouse/arrow/schema.rs          |    5 +-
 src/sinks/clickhouse/integration_tests.rs     |  525 +++++-
 8 files changed, 3066 insertions(+), 328 deletions(-)
 create mode 100644 lib/codecs/src/encoding/format/arrow/types/complex.rs

diff --git a/lib/codecs/src/encoding/format/arrow/builder.rs b/lib/codecs/src/encoding/format/arrow/builder.rs
index 1b9af9bb721dc..1c7e7a613156d 100644
--- a/lib/codecs/src/encoding/format/arrow/builder.rs
+++ b/lib/codecs/src/encoding/format/arrow/builder.rs
@@ -11,8 +11,9 @@ use crate::encoding::format::arrow::{
     types::{
         build_binary_array, build_boolean_array, build_decimal128_array, build_decimal256_array,
         build_float32_array, build_float64_array, build_int8_array, build_int16_array,
-        build_int32_array, build_int64_array, build_string_array, build_timestamp_array,
-        build_uint8_array, build_uint16_array, build_uint32_array, build_uint64_array,
+        build_int32_array, build_int64_array, build_list_array, build_map_array,
+        build_string_array, build_struct_array, build_timestamp_array, build_uint8_array,
+        build_uint16_array, build_uint32_array, build_uint64_array,
     },
 };

@@ -50,6 +51,13 @@ pub(crate) fn build_record_batch(
         DataType::Decimal256(precision, scale) => {
             build_decimal256_array(events, field_name, *precision, *scale, nullable)?
         }
+        DataType::List(inner_field) => {
+            build_list_array(events, field_name, inner_field, nullable)?
+        }
+        DataType::Struct(fields) => build_struct_array(events, field_name, fields, nullable)?,
+        DataType::Map(entries_field, _) => {
+            build_map_array(events, field_name, entries_field, nullable)?
+ } other_type => { return Err(ArrowEncodingError::UnsupportedType { field_name: field_name.into(), diff --git a/lib/codecs/src/encoding/format/arrow/mod.rs b/lib/codecs/src/encoding/format/arrow/mod.rs index 4eb300a9406ce..c87ad4226043e 100644 --- a/lib/codecs/src/encoding/format/arrow/mod.rs +++ b/lib/codecs/src/encoding/format/arrow/mod.rs @@ -235,14 +235,30 @@ pub fn encode_events_to_arrow_ipc_stream( } /// Recursively makes a Field and all its nested fields nullable -pub(crate) fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Field { +fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Field { let new_data_type = match field.data_type() { DataType::List(inner_field) => DataType::List(Arc::new(make_field_nullable(inner_field))), DataType::Struct(fields) => { DataType::Struct(fields.iter().map(|f| make_field_nullable(f)).collect()) } - DataType::Map(inner_field, sorted) => { - DataType::Map(Arc::new(make_field_nullable(inner_field)), *sorted) + DataType::Map(inner, sorted) => { + // A Map's inner field is typically a "entries" Struct + let DataType::Struct(fields) = inner.data_type() else { + // Fallback for invalid Map structures (preserves original) + return field.clone().with_nullable(true); + }; + + let new_struct_fields = vec![fields[0].clone(), make_field_nullable(&fields[1]).into()]; + + // Reconstruct the inner "entries" field + // The inner field itself must be non-nullable (only the Map wrapper is nullable) + let new_inner_field = inner + .as_ref() + .clone() + .with_data_type(DataType::Struct(new_struct_fields.into())) + .with_nullable(false); + + DataType::Map(Arc::new(new_inner_field), *sorted) } other => other.clone(), }; diff --git a/lib/codecs/src/encoding/format/arrow/tests.rs b/lib/codecs/src/encoding/format/arrow/tests.rs index 2cd411c9b5201..1a05ed19eb0a2 100644 --- a/lib/codecs/src/encoding/format/arrow/tests.rs +++ b/lib/codecs/src/encoding/format/arrow/tests.rs @@ -1,42 +1,92 @@ use super::*; use arrow::{ array::{ - Array, BinaryArray, BooleanArray, Decimal128Array, Decimal256Array, Float64Array, - Int64Array, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt8Array, UInt16Array, UInt32Array, - UInt64Array, + Array, ArrayRef, BinaryArray, BooleanArray, Float64Array, Int64Array, ListArray, MapArray, + StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, }, - datatypes::{Field, TimeUnit}, + datatypes::{DataType, Field, Fields, Schema, TimeUnit}, ipc::reader::StreamReader, }; -use bytes::BytesMut; use chrono::Utc; -use std::io::Cursor; -use tokio_util::codec::Encoder; -use vector_core::event::{Event, LogEvent}; +use std::{io::Cursor, sync::Arc}; +use vector_core::event::{Event, LogEvent, Value}; #[test] fn test_encode_all_types() { + use arrow::array::{ + Decimal128Array, ListArray, MapArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array, + }; + use vrl::value::ObjectMap; + + let now = Utc::now(); + + // Create a struct (tuple) value + let mut tuple_value = ObjectMap::new(); + tuple_value.insert("f0".into(), Value::Bytes("nested_str".into())); + tuple_value.insert("f1".into(), Value::Integer(999)); + + // Create a list value + let list_value = Value::Array(vec![ + Value::Integer(1), + Value::Integer(2), + Value::Integer(3), + ]); + + // Create a map value + let mut map_value = ObjectMap::new(); + map_value.insert("key1".into(), Value::Integer(100)); + 
map_value.insert("key2".into(), Value::Integer(200)); + let mut log = LogEvent::default(); + // Primitive types log.insert("string_field", "test"); log.insert("int8_field", 127); log.insert("int16_field", 32000); log.insert("int32_field", 1000000); log.insert("int64_field", 42); + log.insert("uint8_field", 255); + log.insert("uint16_field", 65535); + log.insert("uint32_field", 4000000); + log.insert("uint64_field", 9000000000_i64); log.insert("float32_field", 3.15); log.insert("float64_field", 3.15); log.insert("bool_field", true); log.insert("bytes_field", bytes::Bytes::from("binary")); - log.insert("timestamp_field", Utc::now()); + log.insert("timestamp_field", now); + log.insert("decimal_field", 99.99); + // Complex types + log.insert("list_field", list_value); + log.insert("struct_field", Value::Object(tuple_value)); + log.insert("map_field", Value::Object(map_value)); let events = vec![Event::Log(log)]; + // Build schema with all supported types + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new("f1", DataType::Int64, true), + ]); + + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int64, true), + ])), + false, + ); + let schema = Arc::new(Schema::new(vec![ Field::new("string_field", DataType::Utf8, true), Field::new("int8_field", DataType::Int8, true), Field::new("int16_field", DataType::Int16, true), Field::new("int32_field", DataType::Int32, true), Field::new("int64_field", DataType::Int64, true), + Field::new("uint8_field", DataType::UInt8, true), + Field::new("uint16_field", DataType::UInt16, true), + Field::new("uint32_field", DataType::UInt32, true), + Field::new("uint64_field", DataType::UInt64, true), Field::new("float32_field", DataType::Float32, true), Field::new("float64_field", DataType::Float64, true), Field::new("bool_field", DataType::Boolean, true), @@ -46,10 +96,22 @@ fn test_encode_all_types() { DataType::Timestamp(TimeUnit::Millisecond, None), true, ), + Field::new("decimal_field", DataType::Decimal128(10, 2), true), + Field::new( + "list_field", + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + true, + ), + Field::new("struct_field", DataType::Struct(struct_fields), true), + Field::new( + "map_field", + DataType::Map(Arc::new(map_entries), false), + true, + ), ])); let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); + assert!(result.is_ok(), "Failed to encode: {:?}", result.err()); let bytes = result.unwrap(); let cursor = Cursor::new(bytes); @@ -57,9 +119,9 @@ fn test_encode_all_types() { let batch = reader.next().unwrap().unwrap(); assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 10); + assert_eq!(batch.num_columns(), 18); - // Verify string field + // Verify all primitive types assert_eq!( batch .column(0) @@ -69,8 +131,6 @@ fn test_encode_all_types() { .value(0), "test" ); - - // Verify int8 field assert_eq!( batch .column(1) @@ -80,8 +140,6 @@ fn test_encode_all_types() { .value(0), 127 ); - - // Verify int16 field assert_eq!( batch .column(2) @@ -91,8 +149,6 @@ fn test_encode_all_types() { .value(0), 32000 ); - - // Verify int32 field assert_eq!( batch .column(3) @@ -102,8 +158,6 @@ fn test_encode_all_types() { .value(0), 1000000 ); - - // Verify int64 field assert_eq!( batch .column(4) @@ -113,11 +167,45 @@ fn test_encode_all_types() { .value(0), 42 ); - - // Verify 
float32 field + assert_eq!( + batch + .column(5) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 255 + ); + assert_eq!( + batch + .column(6) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 65535 + ); + assert_eq!( + batch + .column(7) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 4000000 + ); + assert_eq!( + batch + .column(8) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 9000000000 + ); assert!( (batch - .column(5) + .column(9) .as_any() .downcast_ref::() .unwrap() @@ -126,11 +214,9 @@ fn test_encode_all_types() { .abs() < 0.001 ); - - // Verify float64 field assert!( (batch - .column(6) + .column(10) .as_any() .downcast_ref::() .unwrap() @@ -139,39 +225,82 @@ fn test_encode_all_types() { .abs() < 0.001 ); - - // Verify boolean field assert!( batch - .column(7) + .column(11) .as_any() .downcast_ref::() .unwrap() - .value(0), - "{}", - true + .value(0) ); - - // Verify binary field assert_eq!( batch - .column(8) + .column(12) .as_any() .downcast_ref::() .unwrap() .value(0), b"binary" ); - - // Verify timestamp field - assert!( - !batch - .column(9) + assert_eq!( + batch + .column(13) .as_any() .downcast_ref::() .unwrap() - .is_null(0) + .value(0), + now.timestamp_millis() ); + + let decimal_array: &arrow::array::PrimitiveArray = batch + .column(14) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(decimal_array.value(0), 9999); + + let list_array = batch + .column(15) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 3); + let int_array = list_value.as_any().downcast_ref::().unwrap(); + assert_eq!(int_array.value(0), 1); + assert_eq!(int_array.value(1), 2); + assert_eq!(int_array.value(2), 3); + + // Verify struct field + let struct_array = batch + .column(16) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!struct_array.is_null(0)); + let f0_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f0_array.value(0), "nested_str"); + let f1_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f1_array.value(0), 999); + + // Verify map field + let map_array = batch + .column(17) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!map_array.is_null(0)); + let map_value = map_array.value(0); + assert_eq!(map_value.len(), 2); } #[test] @@ -519,6 +648,8 @@ fn test_encode_invalid_string_timestamp() { #[test] fn test_encode_decimal128_from_integer() { + use arrow::array::Decimal128Array; + let mut log = LogEvent::default(); // Store quantity as integer: 1000 log.insert("quantity", 1000_i64); @@ -555,6 +686,8 @@ fn test_encode_decimal128_from_integer() { #[test] fn test_encode_decimal256() { + use arrow::array::Decimal256Array; + let mut log = LogEvent::default(); // Very large precision number log.insert("big_value", 123456789.123456_f64); @@ -592,6 +725,8 @@ fn test_encode_decimal256() { #[test] fn test_encode_decimal_null_values() { + use arrow::array::Decimal128Array; + let mut log1 = LogEvent::default(); log1.insert("price", 99.99_f64); @@ -637,69 +772,10 @@ fn test_encode_decimal_null_values() { assert_eq!(decimal_array.value(2), 5000_i128); } -#[test] -fn test_encode_unsigned_integer_types() { - let mut log = LogEvent::default(); - log.insert("uint8_field", 255_i64); - log.insert("uint16_field", 65535_i64); - log.insert("uint32_field", 4294967295_i64); - log.insert("uint64_field", 9223372036854775807_i64); - - let events = vec![Event::Log(log)]; - - let 
schema = Arc::new(Schema::new(vec![ - Field::new("uint8_field", DataType::UInt8, true), - Field::new("uint16_field", DataType::UInt16, true), - Field::new("uint32_field", DataType::UInt32, true), - Field::new("uint64_field", DataType::UInt64, true), - ])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); - - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 4); - - // Verify uint8 - let uint8_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(uint8_array.value(0), 255_u8); - - // Verify uint16 - let uint16_array = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(uint16_array.value(0), 65535_u16); - - // Verify uint32 - let uint32_array = batch - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(uint32_array.value(0), 4294967295_u32); - - // Verify uint64 - let uint64_array = batch - .column(3) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(uint64_array.value(0), 9223372036854775807_u64); -} - #[test] fn test_encode_unsigned_integers_with_null_and_overflow() { + use arrow::array::{UInt8Array, UInt32Array}; + let mut log1 = LogEvent::default(); log1.insert("uint8_field", 100_i64); log1.insert("uint32_field", 1000_i64); @@ -778,37 +854,6 @@ fn test_encode_non_nullable_field_with_null_value() { } } -#[test] -fn test_encode_non_nullable_string_field_with_missing_value() { - // Test that encoding fails for non-nullable string field - let mut log1 = LogEvent::default(); - log1.insert("name", "Alice"); - - let mut log2 = LogEvent::default(); - log2.insert("name", "Bob"); - - let log3 = LogEvent::default(); - // log3 is missing name field - - let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - - let schema = Arc::new(Schema::new(vec![Field::new( - "name", - DataType::Utf8, - false, // Not nullable - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(schema)); - assert!(result.is_err()); - - match result.unwrap_err() { - ArrowEncodingError::NullConstraint { field_name } => { - assert_eq!(field_name, "name"); - } - other => panic!("Expected NullConstraint error, got: {:?}", other), - } -} - #[test] fn test_encode_non_nullable_field_all_values_present() { // Test that encoding succeeds when all values are present for non-nullable field @@ -855,6 +900,8 @@ fn test_encode_non_nullable_field_all_values_present() { #[test] fn test_config_allow_nullable_fields_overrides_schema() { + use tokio_util::codec::Encoder; + // Create events: One valid, one missing the "required" field let mut log1 = LogEvent::default(); log1.insert("strict_field", 42); @@ -902,8 +949,6 @@ fn test_config_allow_nullable_fields_overrides_schema() { #[test] fn test_make_field_nullable_with_nested_types() { - use crate::encoding::format::arrow::make_field_nullable; - // Test that make_field_nullable recursively handles List and Struct types // Create a nested structure: Struct containing a List of Structs @@ -961,8 +1006,6 @@ fn test_make_field_nullable_with_nested_types() { #[test] fn test_make_field_nullable_with_map_type() { - use crate::encoding::format::arrow::make_field_nullable; - // Test that make_field_nullable handles Map types // Map is internally represented as List> @@ -986,18 +1029,24 @@ fn test_make_field_nullable_with_map_type() 
{ "Root map field should be nullable" ); - // Verify map entries are nullable + // Verify map entries nullability matches MapBuilder behavior if let DataType::Map(entries_field, _sorted) = nullable_field.data_type() { + // MapBuilder creates entries struct as non-nullable assert!( - entries_field.is_nullable(), - "Map entries field should be nullable" + !entries_field.is_nullable(), + "Map entries field should be non-nullable to match MapBuilder" ); - // Verify the struct inside the map is nullable + // Verify the struct inside the map if let DataType::Struct(struct_fields) = entries_field.data_type() { let key_field = &struct_fields[0]; let value_field = &struct_fields[1]; - assert!(key_field.is_nullable(), "Map key field should be nullable"); + // MapBuilder keeps keys as non-nullable + assert!( + !key_field.is_nullable(), + "Map key field should be non-nullable to match MapBuilder" + ); + // But values field should be transformed to nullable assert!( value_field.is_nullable(), "Map value field should be nullable" @@ -1009,3 +1058,1089 @@ fn test_make_field_nullable_with_map_type() { panic!("Expected Map type for my_map field"); } } + +#[test] +fn test_encode_nested_maps() { + use arrow::array::MapArray; + use vrl::value::ObjectMap; + + // Create nested map: Map> + // {"outer_key1": {"inner_key1": 100, "inner_key2": 200}, "outer_key2": {"inner_key3": 300}} + let mut inner_map1 = ObjectMap::new(); + inner_map1.insert("inner_key1".into(), Value::Integer(100)); + inner_map1.insert("inner_key2".into(), Value::Integer(200)); + + let mut inner_map2 = ObjectMap::new(); + inner_map2.insert("inner_key3".into(), Value::Integer(300)); + + let mut outer_map = ObjectMap::new(); + outer_map.insert("outer_key1".into(), Value::Object(inner_map1)); + outer_map.insert("outer_key2".into(), Value::Object(inner_map2)); + + let mut log = LogEvent::default(); + log.insert("nested_map", Value::Object(outer_map)); + + let events = vec![Event::Log(log)]; + + // Define schema: Map> + // Note: MapBuilder uses "keys" and "values" (plural) as field names + let inner_map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + let inner_map_type = DataType::Map(Arc::new(inner_map_entries), false); + + let outer_map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", inner_map_type, true), + ])), + false, + ); + let outer_map_type = DataType::Map(Arc::new(outer_map_entries), false); + + let schema = Arc::new(Schema::new(vec![Field::new( + "nested_map", + outer_map_type, + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!( + result.is_ok(), + "Failed to encode nested maps: {:?}", + result.as_ref().err() + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the outer map exists + let outer_map_array = batch.column(0).as_any().downcast_ref::().unwrap(); + assert_eq!(outer_map_array.len(), 1); + assert!(!outer_map_array.is_null(0), "Outer map should not be null"); + + // Get the outer map's values (which are inner maps) + let outer_map_value = outer_map_array.value(0); + 
assert_eq!(outer_map_value.len(), 2, "Outer map should have 2 entries"); + + // The outer map's values are themselves a MapArray + let inner_maps = outer_map_array.values(); + let inner_maps_array = inner_maps.as_any().downcast_ref::().unwrap(); + + // Verify we have 2 inner maps (one for each outer key) + // Total entries across both inner maps: 2 + 1 = 3 + assert_eq!(inner_maps_array.len(), 2, "Should have 2 inner maps"); + + // Verify first inner map has 2 entries + let first_inner_map = inner_maps_array.value(0); + assert_eq!( + first_inner_map.len(), + 2, + "First inner map should have 2 entries" + ); + + // Verify second inner map has 1 entry + let second_inner_map = inner_maps_array.value(1); + assert_eq!( + second_inner_map.len(), + 1, + "Second inner map should have 1 entry" + ); +} + +#[test] +fn test_encode_array_of_maps() { + use arrow::array::ListArray; + use vrl::value::ObjectMap; + + // Create array of maps: Array> + // [{"key1": 100, "key2": 200}, {"key3": 300}] + let mut map1 = ObjectMap::new(); + map1.insert("key1".into(), Value::Integer(100)); + map1.insert("key2".into(), Value::Integer(200)); + + let mut map2 = ObjectMap::new(); + map2.insert("key3".into(), Value::Integer(300)); + + let array_of_maps = Value::Array(vec![Value::Object(map1), Value::Object(map2)]); + + let mut log = LogEvent::default(); + log.insert("array_of_maps", array_of_maps); + + let events = vec![Event::Log(log)]; + + // Define schema: List> + // Note: MapBuilder uses "keys" and "values" (plural) as field names + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + let map_type = DataType::Map(Arc::new(map_entries), false); + let list_field = Field::new("item", map_type, true); + + let schema = Arc::new(Schema::new(vec![Field::new( + "array_of_maps", + DataType::List(Arc::new(list_field)), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!( + result.is_ok(), + "Failed to encode array of maps: {:?}", + result.as_ref().err() + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the array exists + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0), "Array should not be null"); + assert_eq!(list_array.value(0).len(), 2, "Array should have 2 maps"); + + // Verify the maps inside the array + let maps = list_array.value(0); + let map_array = maps + .as_any() + .downcast_ref::() + .unwrap(); + + // First map should have 2 entries + let first_map = map_array.value(0); + assert_eq!(first_map.len(), 2, "First map should have 2 entries"); + + // Second map should have 1 entry + let second_map = map_array.value(1); + assert_eq!(second_map.len(), 1, "Second map should have 1 entry"); +} + +#[test] +fn test_encode_array_of_structs() { + use arrow::array::ListArray; + use vrl::value::ObjectMap; + + // Create array of structs (tuples): Array + // [{"f0": "value1", "f1": 100}, {"f0": "value2", "f1": 200}] + let mut tuple1 = ObjectMap::new(); + tuple1.insert("f0".into(), Value::Bytes("value1".into())); + tuple1.insert("f1".into(), Value::Integer(100)); + + let mut tuple2 = ObjectMap::new(); + tuple2.insert("f0".into(), 
Value::Bytes("value2".into())); + tuple2.insert("f1".into(), Value::Integer(200)); + + let array_of_structs = Value::Array(vec![Value::Object(tuple1), Value::Object(tuple2)]); + + let mut log = LogEvent::default(); + log.insert("array_of_structs", array_of_structs); + + let events = vec![Event::Log(log)]; + + // Define schema: List + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new("f1", DataType::Int32, true), + ]); + let struct_type = DataType::Struct(struct_fields); + let list_field = Field::new("item", struct_type, true); + + let schema = Arc::new(Schema::new(vec![Field::new( + "array_of_structs", + DataType::List(Arc::new(list_field)), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!( + result.is_ok(), + "Failed to encode array of structs: {:?}", + result.as_ref().err() + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the array exists and has the correct number of elements + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0), "Array should not be null"); + assert_eq!(list_array.value(0).len(), 2, "Array should have 2 structs"); + + // Verify the structs inside the array + let struct_array = list_array.value(0); + let struct_array = struct_array + .as_any() + .downcast_ref::() + .unwrap(); + + // Check first struct field (f0 - strings) + let f0_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f0_array.value(0), "value1"); + assert_eq!(f0_array.value(1), "value2"); + + // Check second struct field (f1 - integers) + let f1_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f1_array.value(0), 100); + assert_eq!(f1_array.value(1), 200); +} + +#[test] +fn test_encode_empty_arrays_and_maps() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create log with empty array and empty map + let empty_array = Vec::::new(); + let empty_map = ObjectMap::new(); + + let mut log = LogEvent::default(); + log.insert("empty_array", Value::Array(empty_array)); + log.insert("empty_map", Value::Object(empty_map)); + log.insert( + "non_empty_array", + Value::Array(vec![Value::Integer(1), Value::Integer(2)]), + ); + + let events = vec![Event::Log(log)]; + + // Define schema + // Note: MapBuilder uses "keys" and "values" (plural) as field names + let array_field = Field::new("item", DataType::Int32, true); + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "empty_array", + DataType::List(Arc::new(array_field.clone())), + true, + ), + Field::new( + "empty_map", + DataType::Map(Arc::new(map_entries), false), + true, + ), + Field::new( + "non_empty_array", + DataType::List(Arc::new(array_field)), + true, + ), + ])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!( + result.is_ok(), + "Failed to encode empty collections: {:?}", + result.as_ref().err() + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let 
mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 3); + + // Verify empty array + let empty_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!empty_array.is_null(0), "Empty array should not be null"); + assert_eq!(empty_array.value(0).len(), 0, "Array should be empty"); + + // Verify empty map + let empty_map = batch.column(1).as_any().downcast_ref::().unwrap(); + assert!(!empty_map.is_null(0), "Empty map should not be null"); + assert_eq!(empty_map.value(0).len(), 0, "Map should be empty"); + + // Verify non-empty array + let non_empty_array = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!non_empty_array.is_null(0)); + assert_eq!(non_empty_array.value(0).len(), 2); +} + +#[test] +fn test_encode_deep_nesting() { + use arrow::array::ListArray; + + // Create deeply nested array structure (6 levels): + // Array -> Array -> Array -> Array -> Array -> Int32 + let level_5 = Value::Array(vec![Value::Integer(42), Value::Integer(99)]); + let level_4 = Value::Array(vec![level_5]); + let level_3 = Value::Array(vec![level_4]); + let level_2 = Value::Array(vec![level_3]); + let level_1 = Value::Array(vec![level_2]); + + let mut log = LogEvent::default(); + log.insert("deep_array", level_1); + + let events = vec![Event::Log(log)]; + + // Define schema for deep array nesting (6 levels total) + let mut current_field = Field::new("item", DataType::Int32, true); + for _ in 0..5 { + current_field = Field::new("item", DataType::List(Arc::new(current_field)), true); + } + + let schema = Arc::new(Schema::new(vec![Field::new( + "deep_array", + current_field.data_type().clone(), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!( + result.is_ok(), + "Failed to encode deeply nested arrays: {:?}", + result.as_ref().err() + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify deep array by navigating down through all levels + // Store intermediate arrays to avoid lifetime issues + let mut arrays: Vec = Vec::new(); + arrays.push(batch.column(0).clone()); + + // Navigate through 5 nested List levels + for level in 0..5 { + let list_array = arrays[level] + .as_any() + .downcast_ref::() + .unwrap_or_else(|| panic!("Expected ListArray at level {}", level)); + assert!( + !list_array.is_null(0), + "Array should not be null at level {}", + level + ); + assert_eq!( + list_array.len(), + 1, + "Array should have 1 element at level {}", + level + ); + arrays.push(list_array.value(0)); + } + + // Final level (level 5) should be Int32Array with values [42, 99] + let int_array = arrays[5] + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(int_array.len(), 2, "Final array should have 2 elements"); + assert_eq!(int_array.value(0), 42); + assert_eq!(int_array.value(1), 99); +} + +#[test] +fn test_encode_struct_with_list_and_map() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create a struct containing both a list and a map + // Struct { list_field: [1, 2, 3], map_field: {"k1": 10, "k2": 20} } + let mut struct_value = ObjectMap::new(); + struct_value.insert( + "f0".into(), + Value::Array(vec![ + Value::Integer(1), + 
Value::Integer(2), + Value::Integer(3), + ]), + ); + + let mut map_value = ObjectMap::new(); + map_value.insert("k1".into(), Value::Integer(10)); + map_value.insert("k2".into(), Value::Integer(20)); + struct_value.insert("f1".into(), Value::Object(map_value)); + + let mut log = LogEvent::default(); + log.insert("complex_struct", Value::Object(struct_value)); + + let events = vec![Event::Log(log)]; + + // Define schema: Struct { list_field: List, map_field: Map } + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new( + "f0", + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + true, + ), + Field::new("f1", DataType::Map(Arc::new(map_entries), false), true), + ]); + + let schema = Arc::new(Schema::new(vec![Field::new( + "complex_struct", + DataType::Struct(struct_fields), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!( + result.is_ok(), + "Failed to encode struct with list and map: {:?}", + result.err() + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the struct + let struct_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!struct_array.is_null(0)); + + // Verify the list inside the struct (f0) + let list_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 3); + let int_array = list_value + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(int_array.value(0), 1); + assert_eq!(int_array.value(1), 2); + assert_eq!(int_array.value(2), 3); + + // Verify the map inside the struct (f1) + let map_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!map_array.is_null(0)); + let map_value = map_array.value(0); + assert_eq!(map_value.len(), 2); +} + +#[test] +fn test_encode_map_with_struct_values() { + use arrow::array::MapArray; + use vrl::value::ObjectMap; + + // Create a map where values are structs + // Map + // {"item1": {"f0": "Alice", "f1": 10}, "item2": {"f0": "Bob", "f1": 20}} + let mut struct1 = ObjectMap::new(); + struct1.insert("f0".into(), Value::Bytes("Alice".into())); + struct1.insert("f1".into(), Value::Integer(10)); + + let mut struct2 = ObjectMap::new(); + struct2.insert("f0".into(), Value::Bytes("Bob".into())); + struct2.insert("f1".into(), Value::Integer(20)); + + let mut map_value = ObjectMap::new(); + map_value.insert("item1".into(), Value::Object(struct1)); + map_value.insert("item2".into(), Value::Object(struct2)); + + let mut log = LogEvent::default(); + log.insert("map_with_structs", Value::Object(map_value)); + + let events = vec![Event::Log(log)]; + + // Define schema: Map + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new("f1", DataType::Int32, true), + ]); + + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Struct(struct_fields), true), + ])), 
+ false, + ); + + let schema = Arc::new(Schema::new(vec![Field::new( + "map_with_structs", + DataType::Map(Arc::new(map_entries), false), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!( + result.is_ok(), + "Failed to encode map with struct values: {:?}", + result.err() + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the map + let map_array = batch.column(0).as_any().downcast_ref::().unwrap(); + assert!(!map_array.is_null(0)); + let map_value = map_array.value(0); + assert_eq!(map_value.len(), 2); + + // Verify the struct values in the map + let struct_array = map_array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(struct_array.len(), 2); + + // Check f0 field (names) + let names_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let name1 = names_array.value(0); + let name2 = names_array.value(1); + assert!(name1 == "Alice" || name1 == "Bob"); + assert!(name2 == "Alice" || name2 == "Bob"); + assert_ne!(name1, name2); + + // Check f1 field (counts) + let counts_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(counts_array.value(0) == 10 || counts_array.value(0) == 20); + assert!(counts_array.value(1) == 10 || counts_array.value(1) == 20); +} + +#[test] +fn test_encode_list_of_structs_containing_maps() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create a list of structs, where each struct contains a map + // List }> + // [ + // {"f0": 1, "f1": {"color": "red", "size": "large"}}, + // {"f0": 2, "f1": {"color": "blue", "size": "small"}} + // ] + let mut attrs1 = ObjectMap::new(); + attrs1.insert("color".into(), Value::Bytes("red".into())); + attrs1.insert("size".into(), Value::Bytes("large".into())); + + let mut struct1 = ObjectMap::new(); + struct1.insert("f0".into(), Value::Integer(1)); + struct1.insert("f1".into(), Value::Object(attrs1)); + + let mut attrs2 = ObjectMap::new(); + attrs2.insert("color".into(), Value::Bytes("blue".into())); + attrs2.insert("size".into(), Value::Bytes("small".into())); + + let mut struct2 = ObjectMap::new(); + struct2.insert("f0".into(), Value::Integer(2)); + struct2.insert("f1".into(), Value::Object(attrs2)); + + let list_value = Value::Array(vec![Value::Object(struct1), Value::Object(struct2)]); + + let mut log = LogEvent::default(); + log.insert("list_of_structs_with_maps", list_value); + + let events = vec![Event::Log(log)]; + + // Define schema + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + ); + + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Int32, true), + Field::new("f1", DataType::Map(Arc::new(map_entries), false), true), + ]); + + let list_field = Field::new("item", DataType::Struct(struct_fields), true); + + let schema = Arc::new(Schema::new(vec![Field::new( + "list_of_structs_with_maps", + DataType::List(Arc::new(list_field)), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!( + result.is_ok(), + "Failed to encode list of structs with maps: {:?}", + result.err() + ); + + 
let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the list + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 2); + + // Verify the structs in the list + let struct_array = list_value + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(struct_array.len(), 2); + + // Verify IDs (f0) + let id_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_array.value(0), 1); + assert_eq!(id_array.value(1), 2); + + // Verify maps (f1) + let map_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(map_array.len(), 2); + assert!(!map_array.is_null(0)); + assert!(!map_array.is_null(1)); + + // Verify first map has 2 entries + let first_map = map_array.value(0); + assert_eq!(first_map.len(), 2); + + // Verify second map has 2 entries + let second_map = map_array.value(1); + assert_eq!(second_map.len(), 2); +} + +#[test] +fn test_encode_deeply_nested_mixed_types() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create a very complex nested structure: + // Struct { + // data: List, metadata: Map }>> + // } + let mut metadata = ObjectMap::new(); + metadata.insert("key1".into(), Value::Bytes("value1".into())); + + let mut inner_struct = ObjectMap::new(); + inner_struct.insert("f0".into(), Value::Array(vec![Value::Integer(100)])); + inner_struct.insert("f1".into(), Value::Object(metadata)); + + let mut map_in_list = ObjectMap::new(); + map_in_list.insert("item_key".into(), Value::Object(inner_struct)); + + let mut outer_struct = ObjectMap::new(); + outer_struct.insert("f0".into(), Value::Array(vec![Value::Object(map_in_list)])); + + let mut log = LogEvent::default(); + log.insert("deeply_nested", Value::Object(outer_struct)); + + let events = vec![Event::Log(log)]; + + // Define schema + let metadata_map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + ); + + let inner_struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new( + "f0", + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + true, + ), + Field::new( + "f1", + DataType::Map(Arc::new(metadata_map_entries), false), + true, + ), + ]); + + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Struct(inner_struct_fields), true), + ])), + false, + ); + + let list_field = Field::new("item", DataType::Map(Arc::new(map_entries), false), true); + + let outer_struct_fields = arrow::datatypes::Fields::from(vec![Field::new( + "f0", + DataType::List(Arc::new(list_field)), + true, + )]); + + let schema = Arc::new(Schema::new(vec![Field::new( + "deeply_nested", + DataType::Struct(outer_struct_fields), + true, + )])); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); + assert!( + result.is_ok(), + "Failed to encode deeply nested mixed types: {:?}", + result.err() + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = 
StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); + + // Verify the outer struct + let outer_struct = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!outer_struct.is_null(0)); + + // Verify the list inside the outer struct + let list_array = outer_struct + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 1); + + // Verify the map inside the list + let map_array = list_value.as_any().downcast_ref::().unwrap(); + assert_eq!(map_array.len(), 1); + assert!(!map_array.is_null(0)); + + // Verify the struct inside the map + let struct_values = map_array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(struct_values.len(), 1); + + // Verify the list inside the struct + let inner_list = struct_values + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!inner_list.is_null(0)); + let inner_list_value = inner_list.value(0); + assert_eq!(inner_list_value.len(), 1); + + // Verify the innermost map + let inner_map = struct_values + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!inner_map.is_null(0)); + let inner_map_value = inner_map.value(0); + assert_eq!(inner_map_value.len(), 1); +} + +#[test] +fn test_automatic_json_serialization_for_array_of_objects() { + use vrl::value::ObjectMap; + + // Create array of objects (like the user's components data) + let mut obj1 = ObjectMap::new(); + obj1.insert("name".into(), Value::Bytes("tick.mexc.spot".into())); + obj1.insert("alias".into(), Value::Bytes("guiusdt".into())); + obj1.insert("expireAfter".into(), Value::Integer(60000)); + + let mut obj2 = ObjectMap::new(); + obj2.insert("name".into(), Value::Bytes("tick.binance".into())); + obj2.insert("alias".into(), Value::Bytes("btcusdt".into())); + obj2.insert("expireAfter".into(), Value::Integer(30000)); + + let components = Value::Array(vec![Value::Object(obj1), Value::Object(obj2)]); + + let mut log = LogEvent::default(); + log.insert("components", components); + + let events = vec![Event::Log(log)]; + + // Schema expects Array(String), but we're providing Array(Object) + // The encoder should automatically serialize objects to JSON strings + let schema = Schema::new(vec![Field::new( + "components", + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + false, + )]); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::new(schema))); + assert!( + result.is_ok(), + "Encoding should succeed with automatic JSON serialization" + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + + let list_value = list_array.value(0); + let string_array = list_value.as_any().downcast_ref::().unwrap(); + + // Should have 2 strings (JSON serialized objects) + assert_eq!(string_array.len(), 2); + + // Verify the first object was serialized to JSON + let json1 = string_array.value(0); + assert!(json1.contains("\"name\":\"tick.mexc.spot\"")); + assert!(json1.contains("\"alias\":\"guiusdt\"")); + assert!(json1.contains("\"expireAfter\":60000")); + + // Verify the second object was serialized to JSON + let 
json2 = string_array.value(1); + assert!(json2.contains("\"name\":\"tick.binance\"")); + assert!(json2.contains("\"alias\":\"btcusdt\"")); + assert!(json2.contains("\"expireAfter\":30000")); +} + +#[test] +fn test_object_in_map_values_to_string() { + use vrl::value::ObjectMap; + + // Create a map with object values: Map + // Schema expects Map, so objects should serialize to JSON + let mut inner_obj = ObjectMap::new(); + inner_obj.insert("config".into(), Value::Bytes("enabled".into())); + inner_obj.insert("timeout".into(), Value::Integer(5000)); + + let mut map_value = ObjectMap::new(); + map_value.insert("setting1".into(), Value::Object(inner_obj)); + map_value.insert("setting2".into(), Value::Bytes("simple string".into())); + + let mut log = LogEvent::default(); + log.insert("settings", Value::Object(map_value)); + + let events = vec![Event::Log(log)]; + + // Schema: Map (expects string values, but we have objects) + let key_field = Field::new("keys", DataType::Utf8, false); + let value_field = Field::new("values", DataType::Utf8, true); + let entries_struct = DataType::Struct(Fields::from(vec![key_field, value_field])); + let entries_field = Field::new("entries", entries_struct, false); + let map_type = DataType::Map(Arc::new(entries_field), false); + + let schema = Schema::new(vec![Field::new("settings", map_type, false)]); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::new(schema))); + assert!( + result.is_ok(), + "Map with object values should serialize to JSON strings" + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(batch.num_rows(), 1); + + let map_array = batch.column(0).as_any().downcast_ref::().unwrap(); + assert!(!map_array.is_null(0)); + + // Get the values from the map + let values_array = map_array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + + // One value should be a JSON object, one should be a plain string + let mut found_json_object = false; + let mut found_plain_string = false; + + for i in 0..values_array.len() { + let value = values_array.value(i); + if value.contains("\"config\"") && value.contains("\"timeout\"") { + found_json_object = true; + } else if value == "simple string" { + found_plain_string = true; + } + } + + assert!( + found_json_object, + "Should find JSON-serialized object in map values" + ); + assert!(found_plain_string, "Should find plain string in map values"); +} + +#[test] +fn test_nested_arrays_with_objects() { + use vrl::value::ObjectMap; + + // Array of arrays, where inner arrays contain objects + let mut obj = ObjectMap::new(); + obj.insert("id".into(), Value::Integer(123)); + + let inner_array = Value::Array(vec![Value::Object(obj.clone())]); + let outer_array = Value::Array(vec![inner_array]); + + let mut log = LogEvent::default(); + log.insert("nested", outer_array); + + let events = vec![Event::Log(log)]; + + // Schema: Array(Array(String)) + let inner_field = Field::new("item", DataType::Utf8, true); + let middle_field = Field::new("item", DataType::List(Arc::new(inner_field)), true); + let outer_list = DataType::List(Arc::new(middle_field)); + + let schema = Schema::new(vec![Field::new("nested", outer_list, false)]); + + let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::new(schema))); + assert!( + result.is_ok(), + "Nested arrays with objects should serialize" + ); + + let bytes = result.unwrap(); + let cursor = Cursor::new(bytes); + 
let mut reader = StreamReader::try_new(cursor, None).unwrap();
+    let batch = reader.next().unwrap().unwrap();
+
+    assert_eq!(batch.num_rows(), 1);
+
+    // Navigate to the deepest array
+    let outer_list = batch
+        .column(0)
+        .as_any()
+        .downcast_ref::<ListArray>()
+        .unwrap();
+    let outer_value = outer_list.value(0);
+    let middle_list = outer_value.as_any().downcast_ref::<ListArray>().unwrap();
+    let middle_value = middle_list.value(0);
+    let inner_strings = middle_value.as_any().downcast_ref::<StringArray>().unwrap();
+
+    // Should have one JSON string
+    assert_eq!(inner_strings.len(), 1);
+    let json_str = inner_strings.value(0);
+    assert!(
+        json_str.contains("\"id\":123"),
+        "Deeply nested object should be serialized to JSON"
+    );
+}
diff --git a/lib/codecs/src/encoding/format/arrow/types/complex.rs b/lib/codecs/src/encoding/format/arrow/types/complex.rs
new file mode 100644
index 0000000000000..8a3e62ad41ff7
--- /dev/null
+++ b/lib/codecs/src/encoding/format/arrow/types/complex.rs
@@ -0,0 +1,816 @@
+//! Complex type array builders for Arrow encoding
+//!
+//! This module handles nested Arrow types: List, Struct (tuples), and Map.
+
+use arrow::array::{
+    ArrayBuilder, ArrayRef, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder,
+    Int8Builder, Int16Builder, Int32Builder, Int64Builder, ListBuilder, MapBuilder, StringBuilder,
+    StructBuilder, TimestampMicrosecondBuilder, TimestampMillisecondBuilder,
+    TimestampNanosecondBuilder, TimestampSecondBuilder, UInt8Builder, UInt16Builder, UInt32Builder,
+    UInt64Builder,
+};
+use arrow::datatypes::{DataType, Field, Fields, TimeUnit};
+use std::sync::Arc;
+
+use super::super::ArrowEncodingError;
+use super::create_array_builder_for_type;
+use vector_core::event::{Event, Value};
+
+/// Helper macro for downcasting builders
+macro_rules! downcast_builder {
+    ($builder:expr, $builder_type:ty) => {
+        $builder
+            .as_any_mut()
+            .downcast_mut::<$builder_type>()
+            .expect(concat!(
+                "Failed to downcast builder to ",
+                stringify!($builder_type)
+            ))
+    };
+}
+
+/// Helper function to serialize a Value to JSON string.
+/// This is used when the schema expects a string but the data contains complex types.
+fn value_to_json_string(value: &Value) -> Result<String, ArrowEncodingError> {
+    serde_json::to_string(value).map_err(|e| ArrowEncodingError::Io {
+        source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
+    })
+}
+
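+// Illustrative sketch, not part of the module itself: with the helper above, a
+// structured value destined for a Utf8 column ends up as its JSON text, e.g.
+// (hypothetical call, assuming a Value built from VRL's ObjectMap):
+//
+//     let mut obj = vrl::value::ObjectMap::new();
+//     obj.insert("id".into(), Value::Integer(1));
+//     let json = value_to_json_string(&Value::Object(obj))?;
+//     assert_eq!(json, r#"{"id":1}"#);
+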
+/// Appends a null value to an array builder based on the data type.
+fn append_null_to_builder(
+    builder: &mut dyn ArrayBuilder,
+    data_type: &DataType,
+) -> Result<(), ArrowEncodingError> {
+    match data_type {
+        DataType::Int8 => downcast_builder!(builder, Int8Builder).append_null(),
+        DataType::Int16 => downcast_builder!(builder, Int16Builder).append_null(),
+        DataType::Int32 => downcast_builder!(builder, Int32Builder).append_null(),
+        DataType::Int64 => downcast_builder!(builder, Int64Builder).append_null(),
+        DataType::UInt8 => downcast_builder!(builder, UInt8Builder).append_null(),
+        DataType::UInt16 => downcast_builder!(builder, UInt16Builder).append_null(),
+        DataType::UInt32 => downcast_builder!(builder, UInt32Builder).append_null(),
+        DataType::UInt64 => downcast_builder!(builder, UInt64Builder).append_null(),
+        DataType::Float32 => downcast_builder!(builder, Float32Builder).append_null(),
+        DataType::Float64 => downcast_builder!(builder, Float64Builder).append_null(),
+        DataType::Boolean => downcast_builder!(builder, BooleanBuilder).append_null(),
+        DataType::Utf8 => downcast_builder!(builder, StringBuilder).append_null(),
+        DataType::Binary => downcast_builder!(builder, BinaryBuilder).append_null(),
+        DataType::Timestamp(TimeUnit::Second, _) => {
+            downcast_builder!(builder, TimestampSecondBuilder).append_null()
+        }
+        DataType::Timestamp(TimeUnit::Millisecond, _) => {
+            downcast_builder!(builder, TimestampMillisecondBuilder).append_null()
+        }
+        DataType::Timestamp(TimeUnit::Microsecond, _) => {
+            downcast_builder!(builder, TimestampMicrosecondBuilder).append_null()
+        }
+        DataType::Timestamp(TimeUnit::Nanosecond, _) => {
+            downcast_builder!(builder, TimestampNanosecondBuilder).append_null()
+        }
+        DataType::List(_) => {
+            builder
+                .as_any_mut()
+                .downcast_mut::<ListBuilder<Box<dyn ArrayBuilder>>>()
+                .expect("Failed to downcast to ListBuilder")
+                .append_null();
+        }
+        DataType::Struct(_) => downcast_builder!(builder, StructBuilder).append_null(),
+        DataType::Map(_, _) => {
+            builder
+                .as_any_mut()
+                .downcast_mut::<MapBuilder<StringBuilder, Box<dyn ArrayBuilder>>>()
+                .expect("Failed to downcast to MapBuilder")
+                .append(false)
+                .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?;
+        }
+        _ => {}
+    }
+    Ok(())
+}
+
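+// Illustrative usage sketch (an assumption, mirroring how the builders are
+// created in `types/mod.rs`): appending a null dispatches purely on the
+// schema's DataType, e.g.
+//
+//     let mut builder = create_array_builder_for_type(&DataType::Int64, 1)?;
+//     append_null_to_builder(builder.as_mut(), &DataType::Int64)?;
+//     assert!(builder.finish().is_null(0));
+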
+/// Recursively appends a VRL Value to an Arrow array builder.
+fn append_value_to_builder(
+    builder: &mut dyn ArrayBuilder,
+    value: &Value,
+    field: &Field,
+) -> Result<(), ArrowEncodingError> {
+    match (field.data_type(), value) {
+        // Integer types with range checking
+        (DataType::Int8, Value::Integer(i)) if *i >= i8::MIN as i64 && *i <= i8::MAX as i64 => {
+            downcast_builder!(builder, Int8Builder).append_value(*i as i8);
+        }
+        (DataType::Int16, Value::Integer(i)) if *i >= i16::MIN as i64 && *i <= i16::MAX as i64 => {
+            downcast_builder!(builder, Int16Builder).append_value(*i as i16);
+        }
+        (DataType::Int32, Value::Integer(i)) if *i >= i32::MIN as i64 && *i <= i32::MAX as i64 => {
+            downcast_builder!(builder, Int32Builder).append_value(*i as i32);
+        }
+        (DataType::Int64, Value::Integer(i)) => {
+            downcast_builder!(builder, Int64Builder).append_value(*i);
+        }
+        (DataType::UInt8, Value::Integer(i)) if *i >= 0 && *i <= u8::MAX as i64 => {
+            downcast_builder!(builder, UInt8Builder).append_value(*i as u8);
+        }
+        (DataType::UInt16, Value::Integer(i)) if *i >= 0 && *i <= u16::MAX as i64 => {
+            downcast_builder!(builder, UInt16Builder).append_value(*i as u16);
+        }
+        (DataType::UInt32, Value::Integer(i)) if *i >= 0 && *i <= u32::MAX as i64 => {
+            downcast_builder!(builder, UInt32Builder).append_value(*i as u32);
+        }
+        (DataType::UInt64, Value::Integer(i)) if *i >= 0 => {
+            downcast_builder!(builder, UInt64Builder).append_value(*i as u64);
+        }
+        // Float types
+        (DataType::Float32, Value::Float(f)) => {
+            downcast_builder!(builder, Float32Builder).append_value(f.into_inner() as f32);
+        }
+        (DataType::Float32, Value::Integer(i)) => {
+            downcast_builder!(builder, Float32Builder).append_value(*i as f32);
+        }
+        (DataType::Float64, Value::Float(f)) => {
+            downcast_builder!(builder, Float64Builder).append_value(f.into_inner());
+        }
+        (DataType::Float64, Value::Integer(i)) => {
+            downcast_builder!(builder, Float64Builder).append_value(*i as f64);
+        }
+        // Boolean
+        (DataType::Boolean, Value::Boolean(b)) => {
+            downcast_builder!(builder, BooleanBuilder).append_value(*b);
+        }
+        // String types
+        (DataType::Utf8, Value::Bytes(bytes)) => match std::str::from_utf8(bytes) {
+            Ok(s) => downcast_builder!(builder, StringBuilder).append_value(s),
+            Err(_) => {
+                let s = String::from_utf8_lossy(bytes);
+                downcast_builder!(builder, StringBuilder).append_value(&s)
+            }
+        },
+        // Automatic JSON serialization: Object -> String
+        (DataType::Utf8, Value::Object(obj)) => {
+            let json_str = value_to_json_string(&Value::Object(obj.clone()))?;
+            downcast_builder!(builder, StringBuilder).append_value(&json_str);
+        }
+        // Automatic JSON serialization: Array -> String
+        (DataType::Utf8, Value::Array(arr)) => {
+            let json_str = value_to_json_string(&Value::Array(arr.clone()))?;
+            downcast_builder!(builder, StringBuilder).append_value(&json_str);
+        }
+        (DataType::Binary, Value::Bytes(bytes)) => {
+            downcast_builder!(builder, BinaryBuilder).append_value(bytes);
+        }
+
+        // Recursive types: List (Array)
+        (DataType::List(inner_field), Value::Array(arr)) => {
+            let list_builder = builder
+                .as_any_mut()
+                .downcast_mut::<ListBuilder<Box<dyn ArrayBuilder>>>()
+                .ok_or_else(|| ArrowEncodingError::UnsupportedType {
+                    field_name: field.name().clone(),
+                    data_type: field.data_type().clone(),
+                })?;
+
+            for item in arr.iter() {
+                append_value_to_builder(list_builder.values(), item, inner_field)?;
+            }
+            list_builder.append(true);
+        }
+
+        // Recursive types: Struct (Tuple)
+        (DataType::Struct(fields), Value::Object(obj)) => {
+            let struct_builder = builder
+                .as_any_mut()
+                .downcast_mut::<StructBuilder>()
+                .ok_or_else(|| ArrowEncodingError::UnsupportedType {
+                    field_name: field.name().clone(),
+                    data_type: field.data_type().clone(),
+                })?;
+
+            for (i, field) in fields.iter().enumerate() {
+                let key = format!("f{}", i);
+                let field_builder = &mut struct_builder.field_builders_mut()[i];
+                match obj.get(key.as_str()) {
+                    Some(val) => append_value_to_builder(field_builder.as_mut(), val, field)?,
+                    None => append_null_to_builder(field_builder.as_mut(), field.data_type())?,
+                }
+            }
+            struct_builder.append(true);
+        }
+
+        // Recursive types: Map (nested maps)
+        (DataType::Map(entries_field, _), Value::Object(obj)) => {
+            let map_builder = builder
+                .as_any_mut()
+                .downcast_mut::<MapBuilder<StringBuilder, Box<dyn ArrayBuilder>>>()
+                .ok_or_else(|| ArrowEncodingError::UnsupportedType {
+                    field_name: field.name().clone(),
+                    data_type: field.data_type().clone(),
+                })?;
+
+            let DataType::Struct(entries_struct) = entries_field.data_type() else {
+                return Err(ArrowEncodingError::UnsupportedType {
+                    field_name: field.name().clone(),
+                    data_type: field.data_type().clone(),
+                });
+            };
+
+            let value_field = &entries_struct[1];
+            for (key, value) in obj.iter() {
+                map_builder.keys().append_value(key.as_ref());
+                append_value_to_builder(map_builder.values(), value, value_field)?;
+            }
+            map_builder
+                .append(true)
+                .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?;
+        }
+
+        // Null/missing values
+        _ => {
+            if field.is_nullable() {
+                append_null_to_builder(builder, field.data_type())?;
+            } else {
+                return Err(ArrowEncodingError::NullConstraint {
+                    field_name: field.name().clone(),
+                });
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Builds a List array from events for a given field.
+/// Handles all nested types (including List) through recursive builder utilities.
+pub(crate) fn build_list_array(
+    events: &[Event],
+    field_name: &str,
+    inner_field: &Field,
+    nullable: bool,
+) -> Result<ArrayRef, ArrowEncodingError> {
+    let inner_builder = create_array_builder_for_type(
+        inner_field.data_type(),
+        events.len() * 4, // Estimate capacity
+    )?;
+
+    let mut list_builder = ListBuilder::new(inner_builder);
+
+    for event in events {
+        if let Event::Log(log) = event {
+            match log.get(field_name) {
+                Some(Value::Array(arr)) => {
+                    // Recursively append values (handles primitives, structs, maps, nested lists, etc.)
+                    for value in arr.iter() {
+                        append_value_to_builder(list_builder.values(), value, inner_field)?;
+                    }
+                    list_builder.append(true);
+                }
+                _ => {
+                    if !nullable {
+                        return Err(ArrowEncodingError::NullConstraint {
+                            field_name: field_name.into(),
+                        });
+                    }
+                    list_builder.append_null();
+                }
+            }
+        }
+    }
+
+    Ok(Arc::new(list_builder.finish()))
+}
+
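+// Illustrative sketch (an assumption, mirroring the unit tests below): one
+// nullable Int64 list column built straight from log events:
+//
+//     let inner = Field::new("item", DataType::Int64, true);
+//     let column = build_list_array(&events, "numbers", &inner, true)?;
+//     // `column` is an ArrayRef wrapping a ListArray, one entry per event
+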
+/// Builds a Struct array from events for a given field (used for Tuples).
+pub(crate) fn build_struct_array(
+    events: &[Event],
+    field_name: &str,
+    fields: &Fields,
+    nullable: bool,
+) -> Result<ArrayRef, ArrowEncodingError> {
+    // Create builders for each field
+    let field_builders: Vec<Box<dyn ArrayBuilder>> = fields
+        .iter()
+        .map(|f| create_array_builder_for_type(f.data_type(), events.len()))
+        .collect::<Result<Vec<_>, _>>()?;
+
+    let mut struct_builder = StructBuilder::new(fields.clone(), field_builders);
+
+    for event in events {
+        if let Event::Log(log) = event {
+            match log.get(field_name) {
+                Some(Value::Object(obj)) => {
+                    // Tuples are represented as objects with f0, f1, f2... keys
+                    let field_builders = struct_builder.field_builders_mut();
+                    for (i, (field, builder)) in
+                        fields.iter().zip(field_builders.iter_mut()).enumerate()
+                    {
+                        let key = format!("f{}", i);
+                        if let Some(value) = obj.get(key.as_str()) {
+                            append_value_to_builder(builder.as_mut(), value, field)?;
+                        } else {
+                            // If the struct field is non-nullable and the value is missing, error
+                            if !field.is_nullable() {
+                                return Err(ArrowEncodingError::NullConstraint {
+                                    field_name: format!("{}.{}", field_name, field.name()),
+                                });
+                            }
+                            append_null_to_builder(builder.as_mut(), field.data_type())?;
+                        }
+                    }
+                    struct_builder.append(true);
+                }
+                _ => {
+                    if !nullable {
+                        return Err(ArrowEncodingError::NullConstraint {
+                            field_name: field_name.into(),
+                        });
+                    }
+                    // Append nulls to all field builders
+                    let field_builders = struct_builder.field_builders_mut();
+                    for (field, builder) in fields.iter().zip(field_builders.iter_mut()) {
+                        append_null_to_builder(builder.as_mut(), field.data_type())?;
+                    }
+                    struct_builder.append(false);
+                }
+            }
+        }
+    }
+
+    Ok(Arc::new(struct_builder.finish()))
+}
+
+/// Builds a Map array from events for a given field.
+pub(crate) fn build_map_array(
+    events: &[Event],
+    field_name: &str,
+    entries_field: &Field,
+    nullable: bool,
+) -> Result<ArrayRef, ArrowEncodingError> {
+    // Extract key and value fields from entries struct
+    let entries_struct = match entries_field.data_type() {
+        DataType::Struct(fields) => fields,
+        _ => {
+            return Err(ArrowEncodingError::UnsupportedType {
+                field_name: field_name.into(),
+                data_type: entries_field.data_type().clone(),
+            });
+        }
+    };
+
+    if entries_struct.len() != 2 {
+        return Err(ArrowEncodingError::UnsupportedType {
+            field_name: field_name.into(),
+            data_type: entries_field.data_type().clone(),
+        });
+    }
+
+    let value_field = &entries_struct[1];
+
+    // Create builders for keys and values
+    let key_builder = StringBuilder::with_capacity(events.len() * 4, 0);
+    let value_builder = create_array_builder_for_type(value_field.data_type(), events.len() * 4)?;
+
+    let mut map_builder = MapBuilder::new(None, key_builder, value_builder);
+
+    for event in events {
+        if let Event::Log(log) = event {
+            match log.get(field_name) {
+                Some(Value::Object(obj)) => {
+                    // Append each key-value pair
+                    for (key, value) in obj.iter() {
+                        map_builder.keys().append_value(key.as_ref());
+                        append_value_to_builder(map_builder.values(), value, value_field)?;
+                    }
+                    map_builder
+                        .append(true)
+                        .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?;
+                }
+                _ => {
+                    if !nullable {
+                        return Err(ArrowEncodingError::NullConstraint {
+                            field_name: field_name.into(),
+                        });
+                    }
+                    // For null maps, we need to call append(false)
+                    map_builder
+                        .append(false)
+                        .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?;
+                }
+            }
+        }
+    }
+
+    Ok(Arc::new(map_builder.finish()))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{
+        Array, Int32Array, Int64Array, ListArray, MapArray, StringArray, StructArray,
+    };
+    use arrow::datatypes::{DataType, Field, Fields};
+    use std::sync::Arc;
+    use vector_core::event::{Event, LogEvent, Value};
+    use vrl::value::ObjectMap;
+
+    #[test]
+    fn test_build_list_array_with_primitives() {
+        let mut log1 = LogEvent::default();
+        log1.insert(
+            "numbers",
+            Value::Array(vec![
+                Value::Integer(1),
+                Value::Integer(2),
+                Value::Integer(3),
+            ]),
+        );
+
+        let mut log2 = LogEvent::default();
+        log2.insert(
+            "numbers",
+            Value::Array(vec![Value::Integer(4), Value::Integer(5)]),
+        );
+
+        let events = vec![Event::Log(log1), Event::Log(log2)];
+
+ let inner_field = Field::new("item", DataType::Int64, true); + let result = build_list_array(&events, "numbers", &inner_field, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let list_array = array.as_any().downcast_ref::().unwrap(); + + assert_eq!(list_array.len(), 2); + assert!(!list_array.is_null(0)); + assert!(!list_array.is_null(1)); + + // Check first list [1, 2, 3] + let first_list = list_array.value(0); + let int_array = first_list.as_any().downcast_ref::().unwrap(); + assert_eq!(int_array.len(), 3); + assert_eq!(int_array.value(0), 1); + assert_eq!(int_array.value(1), 2); + assert_eq!(int_array.value(2), 3); + + // Check second list [4, 5] + let second_list = list_array.value(1); + let int_array = second_list.as_any().downcast_ref::().unwrap(); + assert_eq!(int_array.len(), 2); + assert_eq!(int_array.value(0), 4); + assert_eq!(int_array.value(1), 5); + } + + #[test] + fn test_build_list_array_with_nulls() { + let mut log1 = LogEvent::default(); + log1.insert("numbers", Value::Array(vec![Value::Integer(1)])); + + let log2 = LogEvent::default(); // Missing field + + let mut log3 = LogEvent::default(); + log3.insert("numbers", Value::Array(vec![Value::Integer(3)])); + + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + + let inner_field = Field::new("item", DataType::Int64, true); + let result = build_list_array(&events, "numbers", &inner_field, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let list_array = array.as_any().downcast_ref::().unwrap(); + + assert_eq!(list_array.len(), 3); + assert!(!list_array.is_null(0)); + assert!(list_array.is_null(1)); // Missing field + assert!(!list_array.is_null(2)); + } + + #[test] + fn test_build_struct_array_with_missing_fields() { + let mut tuple = ObjectMap::new(); + tuple.insert("f0".into(), Value::Bytes("partial".into())); + // f1 is missing + + let mut log = LogEvent::default(); + log.insert("tuple", Value::Object(tuple)); + + let events = vec![Event::Log(log)]; + + let fields = Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new("f1", DataType::Int64, true), // Nullable + ]); + + let result = build_struct_array(&events, "tuple", &fields, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let struct_array = array.as_any().downcast_ref::().unwrap(); + + assert_eq!(struct_array.len(), 1); + + // f0 should have value + let f0_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f0_array.value(0), "partial"); + + // f1 should be null + let f1_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(f1_array.is_null(0)); + } + + #[test] + fn test_build_map_array_with_null() { + let mut map1 = ObjectMap::new(); + map1.insert("key1".into(), Value::Integer(100)); + + let mut log1 = LogEvent::default(); + log1.insert("map", Value::Object(map1)); + + let log2 = LogEvent::default(); // Missing map field + + let events = vec![Event::Log(log1), Event::Log(log2)]; + + let entries_field = Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int64, true), + ])), + false, + ); + + let result = build_map_array(&events, "map", &entries_field, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let map_array = array.as_any().downcast_ref::().unwrap(); + + assert_eq!(map_array.len(), 2); + assert!(!map_array.is_null(0)); + assert!(map_array.is_null(1)); + } + + #[test] + fn 
test_build_map_array_empty_map() { + let mut log = LogEvent::default(); + log.insert("map", Value::Object(ObjectMap::new())); // Empty map + + let events = vec![Event::Log(log)]; + + let entries_field = Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int64, true), + ])), + false, + ); + + let result = build_map_array(&events, "map", &entries_field, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let map_array = array.as_any().downcast_ref::().unwrap(); + + assert_eq!(map_array.len(), 1); + assert!(!map_array.is_null(0)); + + let map_value = map_array.value(0); + assert_eq!(map_value.len(), 0); // Empty but not null + } + + #[test] + fn test_json_serialization_object_to_string() { + let mut obj = ObjectMap::new(); + obj.insert("name".into(), Value::Bytes("test".into())); + obj.insert("count".into(), Value::Integer(42)); + + let mut log = LogEvent::default(); + log.insert("data", Value::Array(vec![Value::Object(obj)])); + + let events = vec![Event::Log(log)]; + + // Schema expects List + let inner_field = Field::new("item", DataType::Utf8, true); + let result = build_list_array(&events, "data", &inner_field, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let list_array = array.as_any().downcast_ref::().unwrap(); + + let values = list_array.value(0); + let string_array = values.as_any().downcast_ref::().unwrap(); + let json_str = string_array.value(0); + + // Should be JSON serialized + assert!(json_str.contains("\"name\"")); + assert!(json_str.contains("test")); + assert!(json_str.contains("\"count\"")); + assert!(json_str.contains("42")); + } + + #[test] + fn test_json_serialization_array_to_string() { + let mut log = LogEvent::default(); + log.insert( + "data", + Value::Array(vec![Value::Array(vec![ + Value::Integer(1), + Value::Integer(2), + Value::Integer(3), + ])]), + ); + + let events = vec![Event::Log(log)]; + + // Schema expects List + let inner_field = Field::new("item", DataType::Utf8, true); + let result = build_list_array(&events, "data", &inner_field, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let list_array = array.as_any().downcast_ref::().unwrap(); + + let values = list_array.value(0); + let string_array = values.as_any().downcast_ref::().unwrap(); + let json_str = string_array.value(0); + + // Should be JSON serialized array + assert_eq!(json_str, "[1,2,3]"); + } + + #[test] + fn test_nested_list_of_structs() { + let mut tuple1 = ObjectMap::new(); + tuple1.insert("f0".into(), Value::Integer(1)); + tuple1.insert("f1".into(), Value::Bytes("a".into())); + + let mut tuple2 = ObjectMap::new(); + tuple2.insert("f0".into(), Value::Integer(2)); + tuple2.insert("f1".into(), Value::Bytes("b".into())); + + let mut log = LogEvent::default(); + log.insert( + "data", + Value::Array(vec![Value::Object(tuple1), Value::Object(tuple2)]), + ); + + let events = vec![Event::Log(log)]; + + let struct_fields = Fields::from(vec![ + Field::new("f0", DataType::Int32, true), + Field::new("f1", DataType::Utf8, true), + ]); + + let inner_field = Field::new("item", DataType::Struct(struct_fields), true); + let result = build_list_array(&events, "data", &inner_field, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let list_array = array.as_any().downcast_ref::().unwrap(); + + let values = list_array.value(0); + let struct_array = values.as_any().downcast_ref::().unwrap(); + + assert_eq!(struct_array.len(), 2); + + let 
f0_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f0_array.value(0), 1); + assert_eq!(f0_array.value(1), 2); + + let f1_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f1_array.value(0), "a"); + assert_eq!(f1_array.value(1), "b"); + } + + #[test] + fn test_nested_struct_with_list() { + let mut tuple = ObjectMap::new(); + tuple.insert("f0".into(), Value::Bytes("name".into())); + tuple.insert( + "f1".into(), + Value::Array(vec![ + Value::Integer(1), + Value::Integer(2), + Value::Integer(3), + ]), + ); + + let mut log = LogEvent::default(); + log.insert("data", Value::Object(tuple)); + + let events = vec![Event::Log(log)]; + + let fields = Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new( + "f1", + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + true, + ), + ]); + + let result = build_struct_array(&events, "data", &fields, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let struct_array = array.as_any().downcast_ref::().unwrap(); + + // Check f0 (string) + let f0_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f0_array.value(0), "name"); + + // Check f1 (list) + let f1_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + let list_values = f1_array.value(0); + let int_array = list_values.as_any().downcast_ref::().unwrap(); + assert_eq!(int_array.len(), 3); + assert_eq!(int_array.value(0), 1); + assert_eq!(int_array.value(1), 2); + assert_eq!(int_array.value(2), 3); + } + + #[test] + fn test_nested_map_with_struct_values() { + let mut struct_value = ObjectMap::new(); + struct_value.insert("f0".into(), Value::Integer(42)); + struct_value.insert("f1".into(), Value::Bytes("test".into())); + + let mut map = ObjectMap::new(); + map.insert("key1".into(), Value::Object(struct_value)); + + let mut log = LogEvent::default(); + log.insert("data", Value::Object(map)); + + let events = vec![Event::Log(log)]; + + let struct_fields = Fields::from(vec![ + Field::new("f0", DataType::Int64, true), + Field::new("f1", DataType::Utf8, true), + ]); + + let entries_field = Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Struct(struct_fields), true), + ])), + false, + ); + + let result = build_map_array(&events, "data", &entries_field, true); + + assert!(result.is_ok()); + let array = result.unwrap(); + let map_array = array.as_any().downcast_ref::().unwrap(); + + assert_eq!(map_array.len(), 1); + assert!(!map_array.is_null(0)); + + let map_value = map_array.value(0); + assert_eq!(map_value.len(), 1); + + // Verify struct values + let struct_array = map_array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let f0_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f0_array.value(0), 42); + + let f1_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f1_array.value(0), "test"); + } +} diff --git a/lib/codecs/src/encoding/format/arrow/types/mod.rs b/lib/codecs/src/encoding/format/arrow/types/mod.rs index f55c958f6d740..625855cf2ad3c 100644 --- a/lib/codecs/src/encoding/format/arrow/types/mod.rs +++ b/lib/codecs/src/encoding/format/arrow/types/mod.rs @@ -1,7 +1,16 @@ +use arrow::array::{ + ArrayBuilder, ListBuilder, MapBuilder, StringBuilder, StructBuilder, make_builder, +}; +use arrow::datatypes::DataType; + +use 
super::ArrowEncodingError;
+
+mod complex;
 mod decimal;
 mod primitives;
 mod temporal;
+pub(crate) use complex::{build_list_array, build_map_array, build_struct_array};
 pub(crate) use decimal::{build_decimal128_array, build_decimal256_array};
 pub(crate) use primitives::{
     build_binary_array, build_boolean_array, build_float32_array, build_float64_array,
@@ -9,3 +18,70 @@ pub(crate) use primitives::{
     build_uint8_array, build_uint16_array, build_uint32_array, build_uint64_array,
 };
 pub(crate) use temporal::build_timestamp_array;
+
+const NESTED_CAPACITY_MULTIPLIER: usize = 4;
+
+/// Creates an array builder for a given Arrow data type.
+///
+/// Uses Arrow's `make_builder` for most types, but provides custom handling
+/// for complex nested types (List, Struct, Map) to ensure proper recursive
+/// builder creation, especially for nested Maps which `make_builder` doesn't
+/// fully support.
+pub(crate) fn create_array_builder_for_type(
+    data_type: &DataType,
+    capacity: usize,
+) -> Result<Box<dyn ArrayBuilder>, ArrowEncodingError> {
+    match data_type {
+        DataType::List(inner_field) => {
+            create_list_builder(inner_field.data_type(), capacity)
+        }
+        DataType::Struct(fields) => {
+            create_struct_builder(fields, capacity)
+        }
+        DataType::Map(entries_field, _) => {
+            create_map_builder(entries_field.data_type(), capacity)
+        }
+        _ => Ok(make_builder(data_type, capacity)),
+    }
+}
+
+fn create_list_builder(
+    inner_type: &DataType,
+    capacity: usize,
+) -> Result<Box<dyn ArrayBuilder>, ArrowEncodingError> {
+    let nested_capacity = capacity * NESTED_CAPACITY_MULTIPLIER;
+    let inner_builder = create_array_builder_for_type(inner_type, nested_capacity)?;
+    Ok(Box::new(ListBuilder::new(inner_builder)))
+}
+
+fn create_struct_builder(
+    fields: &arrow::datatypes::Fields,
+    capacity: usize,
+) -> Result<Box<dyn ArrayBuilder>, ArrowEncodingError> {
+    let field_builders = fields
+        .iter()
+        .map(|f| create_array_builder_for_type(f.data_type(), capacity))
+        .collect::<Result<Vec<_>, _>>()?;
+    Ok(Box::new(StructBuilder::new(fields.clone(), field_builders)))
+}
+
+fn create_map_builder(
+    entries_type: &DataType,
+    capacity: usize,
+) -> Result<Box<dyn ArrayBuilder>, ArrowEncodingError> {
+    let DataType::Struct(entries_fields) = entries_type else {
+        return Err(ArrowEncodingError::UnsupportedType {
+            field_name: "dynamic".into(),
+            data_type: entries_type.clone(),
+        });
+    };
+
+    let nested_capacity = capacity * NESTED_CAPACITY_MULTIPLIER;
+    let key_builder = StringBuilder::with_capacity(nested_capacity, 0);
+    let value_builder = create_array_builder_for_type(
+        entries_fields[1].data_type(),
+        nested_capacity,
+    )?;
+
+    Ok(Box::new(MapBuilder::new(None, key_builder, value_builder)))
+}
diff --git a/src/sinks/clickhouse/arrow/parser.rs b/src/sinks/clickhouse/arrow/parser.rs
index a13bd823487b5..506a228e434b3 100644
--- a/src/sinks/clickhouse/arrow/parser.rs
+++ b/src/sinks/clickhouse/arrow/parser.rs
@@ -1,6 +1,7 @@
 //! ClickHouse type parsing and conversion to Arrow types.
 
-use arrow::datatypes::{DataType, TimeUnit};
+use arrow::datatypes::{DataType, Field, Fields, TimeUnit};
+use std::sync::Arc;
 
 const DECIMAL32_PRECISION: u8 = 9;
 const DECIMAL64_PRECISION: u8 = 18;
@@ -16,6 +17,12 @@ pub enum ClickHouseType<'a> {
     Nullable(Box<ClickHouseType<'a>>),
     /// LowCardinality(T)
     LowCardinality(Box<ClickHouseType<'a>>),
+    /// Array(T)
+    Array(Box<ClickHouseType<'a>>),
+    /// Tuple(T1, T2, ...)
+    Tuple(Vec<ClickHouseType<'a>>),
+    /// Map(K, V)
+    Map(Box<ClickHouseType<'a>>, Box<ClickHouseType<'a>>),
 }
 
 impl<'a> ClickHouseType<'a> {
@@ -38,157 +45,196 @@ impl<'a> ClickHouseType<'a> {
             _ => self,
         }
     }
+
+    /// Converts this structured ClickHouseType to an Arrow DataType.
+    /// Returns a tuple of (DataType, is_nullable).
+ pub fn to_arrow(&self) -> Result<(DataType, bool), String> { + let is_nullable = self.is_nullable(); + + match self.base_type() { + ClickHouseType::Primitive(name) => { + let (type_name, _) = extract_identifier(name); + let data_type = match type_name { + // Numeric + "Int8" => DataType::Int8, + "Int16" => DataType::Int16, + "Int32" => DataType::Int32, + "Int64" => DataType::Int64, + "UInt8" => DataType::UInt8, + "UInt16" => DataType::UInt16, + "UInt32" => DataType::UInt32, + "UInt64" => DataType::UInt64, + "Float32" => DataType::Float32, + "Float64" => DataType::Float64, + "Bool" => DataType::Boolean, + "Decimal" | "Decimal32" | "Decimal64" | "Decimal128" | "Decimal256" => { + parse_decimal_type(name)? + } + + // Strings + "String" | "FixedString" => DataType::Utf8, + + // Date and time types (timezones not currently handled, defaults to UTC) + "Date" | "Date32" => DataType::Date32, + "DateTime" => DataType::Timestamp(TimeUnit::Second, None), + "DateTime64" => parse_datetime64_precision(name)?, + + // Unknown + _ => { + return Err(format!( + "Unknown ClickHouse type '{}'. This type cannot be automatically converted.", + type_name + )); + } + }; + Ok((data_type, is_nullable)) + } + ClickHouseType::Array(inner) => { + let (inner_arrow, inner_nullable) = inner.to_arrow()?; + let field = Field::new("item", inner_arrow, inner_nullable); + Ok((DataType::List(Arc::new(field)), is_nullable)) + } + ClickHouseType::Tuple(elements) => { + let fields: Result, String> = elements + .iter() + .enumerate() + .map(|(i, elem)| { + let (elem_arrow, elem_nullable) = elem.to_arrow()?; + Ok(Field::new(format!("f{}", i), elem_arrow, elem_nullable)) + }) + .collect(); + Ok((DataType::Struct(Fields::from(fields?)), is_nullable)) + } + ClickHouseType::Map(key_type, value_type) => { + // Validate key is String + let (key_arrow, _) = key_type.to_arrow()?; + if !matches!(key_arrow, DataType::Utf8) { + return Err( + "Map keys must be String type. Vector's ObjectMap only supports String keys." + .to_string(), + ); + } + + // Recursively convert value type + let (value_arrow, value_nullable) = value_type.to_arrow()?; + + // Arrow Map is represented as Map + let key_field = Field::new("keys", DataType::Utf8, false); + let value_field = Field::new("values", value_arrow, value_nullable); + let entries_struct = DataType::Struct(Fields::from(vec![key_field, value_field])); + let entries_field = Field::new("entries", entries_struct, false); + Ok((DataType::Map(Arc::new(entries_field), false), is_nullable)) + } + _ => Err("Unsupported ClickHouse type".to_string()), + } + } } /// Parses a ClickHouse type string into a structured representation. 
pub fn parse_ch_type(ty: &str) -> ClickHouseType<'_> { let ty = ty.trim(); - // Recursively strip and parse type modifiers - if let Some(inner) = strip_wrapper(ty, "Nullable") { - return ClickHouseType::Nullable(Box::new(parse_ch_type(inner))); - } - if let Some(inner) = strip_wrapper(ty, "LowCardinality") { - return ClickHouseType::LowCardinality(Box::new(parse_ch_type(inner))); + // Try to match type_name(args) pattern + if let Some((type_name, args_str)) = try_parse_wrapper(ty) { + match type_name { + "Nullable" => { + return ClickHouseType::Nullable(Box::new(parse_ch_type(args_str))); + } + "LowCardinality" => { + return ClickHouseType::LowCardinality(Box::new(parse_ch_type(args_str))); + } + "Array" => { + return ClickHouseType::Array(Box::new(parse_ch_type(args_str))); + } + "Tuple" => { + let elements = parse_args(args_str) + .into_iter() + .map(|arg| parse_ch_type(arg)) + .collect(); + return ClickHouseType::Tuple(elements); + } + "Map" => { + let args = parse_args(args_str); + if args.len() == 2 { + return ClickHouseType::Map( + Box::new(parse_ch_type(args[0])), + Box::new(parse_ch_type(args[1])), + ); + } + } + _ => {} // Fall through to primitive + } } - // Base case: return primitive type for anything without modifiers + // Base case: return primitive type ClickHouseType::Primitive(ty) } -/// Helper function to strip a wrapper from a type string. -/// Returns the inner content if the type matches the wrapper pattern. -fn strip_wrapper<'a>(ty: &'a str, wrapper_name: &str) -> Option<&'a str> { - ty.strip_prefix(wrapper_name)? - .trim_start() - .strip_prefix('(')? - .strip_suffix(')') -} - -/// Unwraps ClickHouse type modifiers like Nullable() and LowCardinality(). -/// Returns a tuple of (base_type, is_nullable). -/// For example: "LowCardinality(Nullable(String))" -> ("String", true) -pub fn unwrap_type_modifiers(ch_type: &str) -> (&str, bool) { - let parsed = parse_ch_type(ch_type); - let is_nullable = parsed.is_nullable(); - - match parsed.base_type() { - ClickHouseType::Primitive(base) => (base, is_nullable), - _ => (ch_type, is_nullable), +/// Tries to parse "TypeName(args)" into ("TypeName", "args"). +fn try_parse_wrapper(ty: &str) -> Option<(&str, &str)> { + let paren_pos = ty.find('(')?; + if !ty.ends_with(')') { + return None; } -} -fn unsupported(ch_type: &str, kind: &str) -> String { - format!( - "{kind} type '{ch_type}' is not supported. \ - ClickHouse {kind} types cannot be automatically converted to Arrow format." - ) -} + let type_name = ty[..paren_pos].trim(); + let args = &ty[paren_pos + 1..ty.len() - 1]; -/// Converts a ClickHouse type string to an Arrow DataType. -/// Returns a tuple of (DataType, is_nullable). -pub fn clickhouse_type_to_arrow(ch_type: &str) -> Result<(DataType, bool), String> { - let (base_type, is_nullable) = unwrap_type_modifiers(ch_type); - let (type_name, _) = extract_identifier(base_type); - - let data_type = match type_name { - // Numeric - "Int8" => DataType::Int8, - "Int16" => DataType::Int16, - "Int32" => DataType::Int32, - "Int64" => DataType::Int64, - "UInt8" => DataType::UInt8, - "UInt16" => DataType::UInt16, - "UInt32" => DataType::UInt32, - "UInt64" => DataType::UInt64, - "Float32" => DataType::Float32, - "Float64" => DataType::Float64, - "Bool" => DataType::Boolean, - "Decimal" | "Decimal32" | "Decimal64" | "Decimal128" | "Decimal256" => { - parse_decimal_type(base_type)? 
-        }
-
-        // Strings
-        "String" | "FixedString" => DataType::Utf8,
-
-        // Date and time types (timezones not currently handled, defaults to UTC)
-        "Date" | "Date32" => DataType::Date32,
-        "DateTime" => DataType::Timestamp(TimeUnit::Second, None),
-        "DateTime64" => parse_datetime64_precision(base_type)?,
-
-        // Unsupported
-        "Array" => return Err(unsupported(ch_type, "Array")),
-        "Tuple" => return Err(unsupported(ch_type, "Tuple")),
-        "Map" => return Err(unsupported(ch_type, "Map")),
-
-        // Unknown
-        _ => {
-            return Err(format!(
-                "Unknown ClickHouse type '{}'. This type cannot be automatically converted.",
-                type_name
-            ));
-        }
-    };
-
-    Ok((data_type, is_nullable))
+    Some((type_name, args))
 }
 
-/// Extracts an identifier from the start of a string.
-/// Returns (identifier, remaining_string).
-fn extract_identifier(input: &str) -> (&str, &str) {
-    for (i, c) in input.char_indices() {
-        if c.is_alphabetic() || c == '_' || (i > 0 && c.is_numeric()) {
-            continue;
-        }
-        return (&input[..i], &input[i..]);
-    }
-    (input, "")
-}
-
-/// Parses comma-separated arguments from a parenthesized string.
-/// Input: "(arg1, arg2, arg3)" -> Output: Ok(vec!["arg1".to_string(), "arg2".to_string(), "arg3".to_string()])
-/// Returns an error if parentheses are malformed.
-fn parse_args(input: &str) -> Result<Vec<String>, String> {
-    let trimmed = input.trim();
-    if !trimmed.starts_with('(') || !trimmed.ends_with(')') {
-        return Err(format!(
-            "Expected parentheses around arguments in '{}'",
-            input
-        ));
-    }
+/// Parses comma-separated arguments, respecting nesting and quotes.
+/// Handles input with or without surrounding parentheses.
+/// Examples: "Int32, String" or "(Int32, String)" both work.
+fn parse_args(input: &str) -> Vec<&str> {
+    let input = input.trim();
+
+    // Strip parentheses if present
+    let input = if input.starts_with('(') && input.ends_with(')') {
+        &input[1..input.len() - 1]
+    } else {
+        input
+    };
 
-    let inner = trimmed[1..trimmed.len() - 1].trim();
-    if inner.is_empty() {
-        return Ok(vec![]);
+    if input.is_empty() {
+        return vec![];
     }
 
-    // Split by comma, handling nested parentheses and quotes
     let mut args = Vec::new();
-    let mut current_arg = String::new();
+    let mut start = 0;
     let mut depth = 0;
     let mut in_quotes = false;
 
-    for c in inner.chars() {
+    for (i, c) in input.char_indices() {
         match c {
-            '\'' if !in_quotes => in_quotes = true,
-            '\'' if in_quotes => in_quotes = false,
+            '\'' => in_quotes = !in_quotes,
             '(' if !in_quotes => depth += 1,
             ')' if !in_quotes => depth -= 1,
             ',' if depth == 0 && !in_quotes => {
-                args.push(current_arg.trim().to_string());
-                current_arg = String::new();
-                continue;
+                args.push(input[start..i].trim());
+                start = i + 1;
             }
             _ => {}
         }
-        current_arg.push(c);
     }
 
-    if !current_arg.trim().is_empty() {
-        args.push(current_arg.trim().to_string());
-    }
+    args.push(input[start..].trim());
+    args
+}
 
-    Ok(args)
+/// Extracts an identifier from the start of a string.
+/// Returns (identifier, remaining_string).
+fn extract_identifier(input: &str) -> (&str, &str) {
+    for (i, c) in input.char_indices() {
+        if c.is_alphabetic() || c == '_' || (i > 0 && c.is_numeric()) {
+            continue;
+        }
+        return (&input[..i], &input[i..]);
+    }
+    (input, "")
 }
 
 /// Parses ClickHouse Decimal types and returns the appropriate Arrow decimal type.
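///
/// Illustrative sketch (the exact precision constants for the sized
/// `Decimal32`/`Decimal64`/`Decimal128`/`Decimal256` aliases live in the
/// match inside the function; this only shows the two-argument form):
///
/// ```ignore
/// let dt = parse_decimal_type("Decimal(10, 2)")?;
/// assert_eq!(dt, DataType::Decimal128(10, 2));
/// ```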
@@ -204,7 +250,8 @@ fn parse_decimal_type(ch_type: &str) -> Result<DataType, String> {
 
     // Parse from type string
     let (type_name, args_str) = extract_identifier(ch_type);
-    let result = parse_args(args_str).ok().and_then(|args| match type_name {
+    let args = parse_args(args_str);
+    let result = match type_name {
         "Decimal" if args.len() == 2 => args[0].parse::<u8>().ok().zip(args[1].parse::<i8>().ok()),
         "Decimal32" | "Decimal64" | "Decimal128" | "Decimal256" if args.len() == 1 => {
             args[0].parse::<i8>().ok().map(|scale| {
@@ -219,7 +266,7 @@ fn parse_decimal_type(ch_type: &str) -> Result<DataType, String> {
             })
         }
         _ => None,
-    });
+    };
 
     result
         .map(|(precision, scale)| {
@@ -242,12 +289,7 @@ fn parse_datetime64_precision(ch_type: &str) -> Result<DataType, String> {
 
     // Parse from type string
     let (_type_name, args_str) = extract_identifier(ch_type);
-    let args = parse_args(args_str).map_err(|e| {
-        format!(
-            "Could not parse DateTime64 arguments from '{}': {}. Expected format: DateTime64(0-9) or DateTime64(0-9, 'timezone')",
-            ch_type, e
-        )
-    })?;
+    let args = parse_args(args_str);
 
     // DateTime64(precision) or DateTime64(precision, 'timezone')
     if args.is_empty() {
@@ -276,7 +318,7 @@ mod tests {
 
     // Helper function for tests that don't need metadata
     fn convert_type_no_metadata(ch_type: &str) -> Result<(DataType, bool), String> {
-        clickhouse_type_to_arrow(ch_type)
+        parse_ch_type(ch_type).to_arrow()
     }
 
     #[test]
@@ -502,82 +544,87 @@ mod tests {
 
     #[test]
     fn test_parse_args() {
-        // Simple cases
-        assert_eq!(
-            parse_args("(10, 2)").unwrap(),
-            vec!["10".to_string(), "2".to_string()]
-        );
-        assert_eq!(parse_args("(3)").unwrap(), vec!["3".to_string()]);
-        assert_eq!(parse_args("()").unwrap(), Vec::<String>::new());
+        // Simple cases with parentheses
+        assert_eq!(parse_args("(10, 2)"), vec!["10", "2"]);
+        assert_eq!(parse_args("(3)"), vec!["3"]);
+        assert_eq!(parse_args("()"), Vec::<&str>::new());
+
+        // Simple cases without parentheses (now supported)
+        assert_eq!(parse_args("10, 2"), vec!["10", "2"]);
+        assert_eq!(parse_args("3"), vec!["3"]);
 
         // With spaces
-        assert_eq!(
-            parse_args("( 10 , 2 )").unwrap(),
-            vec!["10".to_string(), "2".to_string()]
-        );
+        assert_eq!(parse_args("( 10 , 2 )"), vec!["10", "2"]);
 
         // With nested parentheses
+        assert_eq!(parse_args("(Nullable(String))"), vec!["Nullable(String)"]);
         assert_eq!(
-            parse_args("(Nullable(String))").unwrap(),
-            vec!["Nullable(String)".to_string()]
-        );
-        assert_eq!(
-            parse_args("(Array(Int32), String)").unwrap(),
-            vec!["Array(Int32)".to_string(), "String".to_string()]
+            parse_args("(Array(Int32), String)"),
+            vec!["Array(Int32)", "String"]
         );
 
         // With quotes
+        assert_eq!(parse_args("(3, 'UTC')"), vec!["3", "'UTC'"]);
         assert_eq!(
-            parse_args("(3, 'UTC')").unwrap(),
-            vec!["3".to_string(), "'UTC'".to_string()]
-        );
-        assert_eq!(
-            parse_args("(9, 'America/New_York')").unwrap(),
-            vec!["9".to_string(), "'America/New_York'".to_string()]
+            parse_args("(9, 'America/New_York')"),
+            vec!["9", "'America/New_York'"]
        );
 
-        // Complex nested case
+        // Complex nested case with multiple levels, modifiers, named tuples, and quotes
         assert_eq!(
-            parse_args("(Tuple(Int32, String), Array(Float64))").unwrap(),
+            parse_args(
+                "(Array(Tuple(id Int64, tags Array(String))), Map(String, Tuple(Nullable(Float64), LowCardinality(String))), String, DateTime('America/New_York'))"
            ),
             vec![
-                "Tuple(Int32, String)".to_string(),
-                "Array(Float64)".to_string()
+                "Array(Tuple(id Int64, tags Array(String)))",
+                "Map(String, Tuple(Nullable(Float64), LowCardinality(String)))",
+                "String",
+                "DateTime('America/New_York')"
             ]
         );
-
-        // Error cases
-
assert!(parse_args("10, 2").is_err()); // Missing parentheses - assert!(parse_args("(10, 2").is_err()); // Missing closing paren } #[test] - fn test_array_type_not_supported() { - // Array types should return an error + fn test_array_type() { let result = convert_type_no_metadata("Array(Int32)"); - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(err.contains("Array type")); - assert!(err.contains("not supported")); + assert!(result.is_ok()); + let (data_type, is_nullable) = result.unwrap(); + assert!(!is_nullable); + match data_type { + DataType::List(field) => { + assert_eq!(field.data_type(), &DataType::Int32); + assert!(!field.is_nullable()); + } + _ => panic!("Expected List type"), + } } #[test] - fn test_tuple_type_not_supported() { - // Tuple types should return an error + fn test_tuple_type() { let result = convert_type_no_metadata("Tuple(String, Int64)"); - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(err.contains("Tuple type")); - assert!(err.contains("not supported")); + assert!(result.is_ok()); + let (data_type, is_nullable) = result.unwrap(); + assert!(!is_nullable); + match data_type { + DataType::Struct(fields) => { + assert_eq!(fields.len(), 2); + assert_eq!(fields[0].data_type(), &DataType::Utf8); + assert_eq!(fields[1].data_type(), &DataType::Int64); + } + _ => panic!("Expected Struct type"), + } } #[test] - fn test_map_type_not_supported() { - // Map types should return an error + fn test_map_type() { let result = convert_type_no_metadata("Map(String, Int64)"); - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(err.contains("Map type")); - assert!(err.contains("not supported")); + assert!(result.is_ok()); + let (data_type, is_nullable) = result.unwrap(); + assert!(!is_nullable); + match data_type { + DataType::Map(_, _) => {} + _ => panic!("Expected Map type"), + } } #[test] @@ -644,4 +691,120 @@ mod tests { let parsed = parse_ch_type("String"); assert_eq!(parsed.base_type(), &ClickHouseType::Primitive("String")); } + + #[test] + fn test_array_type_parsing() { + // Simple array + let result = convert_type_no_metadata("Array(Int32)"); + assert!(result.is_ok()); + let (dtype, nullable) = result.unwrap(); + assert!(matches!(dtype, DataType::List(_))); + assert!(!nullable); + + // Nested array + let result = convert_type_no_metadata("Array(Array(String))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::List(inner) = dtype { + assert!(matches!(inner.data_type(), DataType::List(_))); + } else { + panic!("Expected List type"); + } + + // Nullable array + let result = convert_type_no_metadata("Nullable(Array(Int64))"); + assert!(result.is_ok()); + let (_, nullable) = result.unwrap(); + assert!(nullable); + } + + #[test] + fn test_tuple_type_parsing() { + // Simple tuple + let result = convert_type_no_metadata("Tuple(String, Int64)"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Struct(fields) = dtype { + assert_eq!(fields.len(), 2); + assert_eq!(fields[0].name(), "f0"); + assert_eq!(fields[1].name(), "f1"); + assert!(matches!(fields[0].data_type(), DataType::Utf8)); + assert!(matches!(fields[1].data_type(), DataType::Int64)); + } else { + panic!("Expected Struct type"); + } + + // Nested tuple + let result = convert_type_no_metadata("Tuple(Int32, Tuple(String, Float64))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Struct(fields) = dtype { + assert_eq!(fields.len(), 2); + 
assert!(matches!(fields[1].data_type(), DataType::Struct(_))); + } else { + panic!("Expected Struct type"); + } + } + + #[test] + fn test_map_type_parsing() { + // Simple map + let result = convert_type_no_metadata("Map(String, Int64)"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + assert!(matches!(dtype, DataType::Map(_, _))); + + // Map with complex value + let result = convert_type_no_metadata("Map(String, Array(Int32))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Map(entries, _) = dtype + && let DataType::Struct(fields) = entries.data_type() + { + let value_field = &fields[1]; + assert!(matches!(value_field.data_type(), DataType::List(_))); + } + + // Non-string key should error + let result = convert_type_no_metadata("Map(Int32, String)"); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(err.contains("Map keys must be String")); + } + + #[test] + fn test_complex_nested_types() { + // Array of tuples + let result = convert_type_no_metadata("Array(Tuple(String, Int64))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::List(inner) = dtype { + assert!(matches!(inner.data_type(), DataType::Struct(_))); + } else { + panic!("Expected List type"); + } + + // Tuple with array and map + let result = convert_type_no_metadata("Tuple(Array(Int32), Map(String, Float64))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Struct(fields) = dtype { + assert_eq!(fields.len(), 2); + assert!(matches!(fields[0].data_type(), DataType::List(_))); + assert!(matches!(fields[1].data_type(), DataType::Map(_, _))); + } else { + panic!("Expected Struct type"); + } + + // Map with tuple values + let result = convert_type_no_metadata("Map(String, Tuple(Int64, String))"); + assert!(result.is_ok()); + let (dtype, _) = result.unwrap(); + if let DataType::Map(entries, _) = dtype + && let DataType::Struct(fields) = entries.data_type() + { + let value_field = &fields[1]; + assert!(matches!(value_field.data_type(), DataType::Struct(_))); + } + } } diff --git a/src/sinks/clickhouse/arrow/schema.rs b/src/sinks/clickhouse/arrow/schema.rs index f2359ca5f3519..fd9842b1bb371 100644 --- a/src/sinks/clickhouse/arrow/schema.rs +++ b/src/sinks/clickhouse/arrow/schema.rs @@ -9,7 +9,7 @@ use vector_lib::codecs::encoding::format::{ArrowEncodingError, SchemaProvider}; use crate::http::{Auth, HttpClient}; -use super::parser::clickhouse_type_to_arrow; +use super::parser::parse_ch_type; #[derive(Debug, Deserialize)] struct ColumnInfo { @@ -87,7 +87,8 @@ fn parse_schema_from_response(response: &str) -> crate::Result { let mut fields = Vec::new(); for column in columns { - let (arrow_type, nullable) = clickhouse_type_to_arrow(&column.column_type) + let (arrow_type, nullable) = parse_ch_type(&column.column_type) + .to_arrow() .map_err(|e| format!("Failed to convert column '{}': {}", column.name, e))?; fields.push(Field::new(&column.name, arrow_type, nullable)); } diff --git a/src/sinks/clickhouse/integration_tests.rs b/src/sinks/clickhouse/integration_tests.rs index 3798595708b41..bdcde12569dad 100644 --- a/src/sinks/clickhouse/integration_tests.rs +++ b/src/sinks/clickhouse/integration_tests.rs @@ -12,11 +12,12 @@ use futures::{ stream, }; use http::StatusCode; +use ordered_float::NotNan; use serde::Deserialize; use serde_json::Value; use tokio::time::{Duration, timeout}; use vector_lib::{ - codecs::encoding::BatchSerializerConfig, + codecs::encoding::{ArrowStreamSerializerConfig, 
BatchSerializerConfig},
     event::{BatchNotifier, BatchStatus, BatchStatusReceiver, Event, LogEvent},
     lookup::PathPrefix,
 };
@@ -605,3 +606,525 @@ async fn insert_events_arrow_with_schema_fetching() {
         assert!(row.get("active").and_then(|v| v.as_bool()).is_some());
     }
 }
+
+#[tokio::test]
+async fn test_complex_types() {
+    trace_init();
+
+    let table = random_table_name();
+    let host = clickhouse_address();
+
+    let mut batch = BatchConfig::default();
+    batch.max_events = Some(3);
+
+    let arrow_config = ArrowStreamSerializerConfig {
+        allow_nullable_fields: true,
+        ..Default::default()
+    };
+
+    let config = ClickhouseConfig {
+        endpoint: host.parse().unwrap(),
+        table: table.clone().try_into().unwrap(),
+        compression: Compression::None,
+        format: crate::sinks::clickhouse::config::Format::ArrowStream,
+        batch_encoding: Some(BatchSerializerConfig::ArrowStream(arrow_config)),
+        batch,
+        request: TowerRequestConfig {
+            retry_attempts: 1,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+
+    let client = ClickhouseClient::new(host);
+
+    // Comprehensive schema with all complex types
+    client
+        .create_table(
+            &table,
+            "host String, timestamp DateTime64(3), message String, \
+            nested_int_array Array(Array(Int32)), \
+            nested_string_array Array(Array(String)), \
+            array_map Map(String, Array(String)), \
+            int_array_map Map(String, Array(Int64)), \
+            tuple_with_array Tuple(String, Array(Int32)), \
+            tuple_with_map Tuple(String, Map(String, Float64)), \
+            tuple_with_nested Tuple(String, Array(Int32), Map(String, Float64)), \
+            locations Array(Tuple(String, Float64, Float64)), \
+            tags_history Array(Map(String, String)), \
+            metrics_history Array(Map(String, Int32)), \
+            request_headers Map(String, String), \
+            response_metrics Tuple(Int32, Int64, Float64), \
+            tags Array(String), \
+            user_properties Map(String, Array(String)), \
+            array_with_nulls Array(Nullable(Int32))",
+        )
+        .await;
+
+    let (sink, _hc) = config.build(SinkContext::default()).await.unwrap();
+
+    let mut events: Vec<Event> = Vec::new();
+
+    // Event 1: Comprehensive test with all complex types
+    let mut event1 = LogEvent::from("Comprehensive complex types test");
+    event1.insert("host", "host1.example.com");
+
+    // Nested arrays
+    event1.insert(
+        "nested_int_array",
+        vector_lib::event::Value::Array(vec![
+            vector_lib::event::Value::Array(vec![
+                vector_lib::event::Value::Integer(1),
+                vector_lib::event::Value::Integer(2),
+            ]),
+            vector_lib::event::Value::Array(vec![
+                vector_lib::event::Value::Integer(3),
+                vector_lib::event::Value::Integer(4),
+            ]),
+        ]),
+    );
+    event1.insert(
+        "nested_string_array",
+        vector_lib::event::Value::Array(vec![vector_lib::event::Value::Array(vec![
+            vector_lib::event::Value::Bytes("a".into()),
+            vector_lib::event::Value::Bytes("b".into()),
+        ])]),
+    );
+
+    // Maps with arrays
+    let mut array_map = vector_lib::event::ObjectMap::new();
+    array_map.insert(
+        "fruits".into(),
+        vector_lib::event::Value::Array(vec![
+            vector_lib::event::Value::Bytes("apple".into()),
+            vector_lib::event::Value::Bytes("banana".into()),
+        ]),
+    );
+    event1.insert("array_map", vector_lib::event::Value::Object(array_map));
+
+    let mut int_array_map = vector_lib::event::ObjectMap::new();
+    int_array_map.insert(
+        "scores".into(),
+        vector_lib::event::Value::Array(vec![
+            vector_lib::event::Value::Integer(95),
+            vector_lib::event::Value::Integer(87),
+        ]),
+    );
+    event1.insert(
+        "int_array_map",
+        vector_lib::event::Value::Object(int_array_map),
+    );
+
+    // Tuples with complex types
+    let mut tuple_with_array =
vector_lib::event::ObjectMap::new(); + tuple_with_array.insert( + "f0".into(), + vector_lib::event::Value::Bytes("numbers".into()), + ); + tuple_with_array.insert( + "f1".into(), + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Integer(10), + vector_lib::event::Value::Integer(20), + ]), + ); + event1.insert( + "tuple_with_array", + vector_lib::event::Value::Object(tuple_with_array), + ); + + let mut inner_map = vector_lib::event::ObjectMap::new(); + inner_map.insert( + "temp".into(), + vector_lib::event::Value::Float(NotNan::new(22.5).unwrap()), + ); + let mut tuple_with_map = vector_lib::event::ObjectMap::new(); + tuple_with_map.insert( + "f0".into(), + vector_lib::event::Value::Bytes("metrics".into()), + ); + tuple_with_map.insert("f1".into(), vector_lib::event::Value::Object(inner_map)); + event1.insert( + "tuple_with_map", + vector_lib::event::Value::Object(tuple_with_map), + ); + + let mut inner_map2 = vector_lib::event::ObjectMap::new(); + inner_map2.insert( + "avg".into(), + vector_lib::event::Value::Float(NotNan::new(95.5).unwrap()), + ); + let mut tuple_complex = vector_lib::event::ObjectMap::new(); + tuple_complex.insert( + "f0".into(), + vector_lib::event::Value::Bytes("results".into()), + ); + tuple_complex.insert( + "f1".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(95)]), + ); + tuple_complex.insert("f2".into(), vector_lib::event::Value::Object(inner_map2)); + event1.insert( + "tuple_with_nested", + vector_lib::event::Value::Object(tuple_complex), + ); + + // Array of tuples + let mut loc1 = vector_lib::event::ObjectMap::new(); + loc1.insert( + "f0".into(), + vector_lib::event::Value::Bytes("San Francisco".into()), + ); + loc1.insert( + "f1".into(), + vector_lib::event::Value::Float(NotNan::new(37.7749).unwrap()), + ); + loc1.insert( + "f2".into(), + vector_lib::event::Value::Float(NotNan::new(-122.4194).unwrap()), + ); + event1.insert( + "locations", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(loc1)]), + ); + + // Array of maps + let mut tags1 = vector_lib::event::ObjectMap::new(); + tags1.insert("env".into(), vector_lib::event::Value::Bytes("prod".into())); + event1.insert( + "tags_history", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(tags1)]), + ); + + let mut metrics1 = vector_lib::event::ObjectMap::new(); + metrics1.insert("cpu".into(), vector_lib::event::Value::Integer(45)); + event1.insert( + "metrics_history", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(metrics1)]), + ); + + // Structured log data + let mut headers = vector_lib::event::ObjectMap::new(); + headers.insert( + "user-agent".into(), + vector_lib::event::Value::Bytes("Mozilla/5.0".into()), + ); + event1.insert("request_headers", vector_lib::event::Value::Object(headers)); + + let mut metrics = vector_lib::event::ObjectMap::new(); + metrics.insert("f0".into(), vector_lib::event::Value::Integer(200)); + metrics.insert("f1".into(), vector_lib::event::Value::Integer(1234)); + metrics.insert( + "f2".into(), + vector_lib::event::Value::Float(NotNan::new(0.145).unwrap()), + ); + event1.insert( + "response_metrics", + vector_lib::event::Value::Object(metrics), + ); + + event1.insert( + "tags", + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Bytes("api".into()), + vector_lib::event::Value::Bytes("v2".into()), + ]), + ); + + let mut user_props = vector_lib::event::ObjectMap::new(); + user_props.insert( + "roles".into(), + 
vector_lib::event::Value::Array(vec![vector_lib::event::Value::Bytes("admin".into())]), + ); + event1.insert( + "user_properties", + vector_lib::event::Value::Object(user_props), + ); + + // Nullable array + event1.insert( + "array_with_nulls", + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Integer(100), + vector_lib::event::Value::Integer(200), + ]), + ); + + events.push(event1.into()); + + // Event 2: Empty and edge cases + let mut event2 = LogEvent::from("Test empty collections"); + event2.insert("host", "host2.example.com"); + event2.insert("nested_int_array", vector_lib::event::Value::Array(vec![])); + event2.insert( + "nested_string_array", + vector_lib::event::Value::Array(vec![]), + ); + + let empty_map = vector_lib::event::ObjectMap::new(); + event2.insert( + "array_map", + vector_lib::event::Value::Object(empty_map.clone()), + ); + event2.insert( + "int_array_map", + vector_lib::event::Value::Object(empty_map.clone()), + ); + + let mut empty_tuple = vector_lib::event::ObjectMap::new(); + empty_tuple.insert("f0".into(), vector_lib::event::Value::Bytes("empty".into())); + empty_tuple.insert("f1".into(), vector_lib::event::Value::Array(vec![])); + event2.insert( + "tuple_with_array", + vector_lib::event::Value::Object(empty_tuple), + ); + + let mut empty_tuple_map = vector_lib::event::ObjectMap::new(); + empty_tuple_map.insert("f0".into(), vector_lib::event::Value::Bytes("empty".into())); + empty_tuple_map.insert( + "f1".into(), + vector_lib::event::Value::Object(empty_map.clone()), + ); + event2.insert( + "tuple_with_map", + vector_lib::event::Value::Object(empty_tuple_map), + ); + + let mut empty_tuple_complex = vector_lib::event::ObjectMap::new(); + empty_tuple_complex.insert("f0".into(), vector_lib::event::Value::Bytes("empty".into())); + empty_tuple_complex.insert("f1".into(), vector_lib::event::Value::Array(vec![])); + empty_tuple_complex.insert( + "f2".into(), + vector_lib::event::Value::Object(empty_map.clone()), + ); + event2.insert( + "tuple_with_nested", + vector_lib::event::Value::Object(empty_tuple_complex), + ); + + event2.insert("locations", vector_lib::event::Value::Array(vec![])); + event2.insert("tags_history", vector_lib::event::Value::Array(vec![])); + event2.insert("metrics_history", vector_lib::event::Value::Array(vec![])); + event2.insert( + "request_headers", + vector_lib::event::Value::Object(empty_map.clone()), + ); + + let mut empty_metrics = vector_lib::event::ObjectMap::new(); + empty_metrics.insert("f0".into(), vector_lib::event::Value::Integer(0)); + empty_metrics.insert("f1".into(), vector_lib::event::Value::Integer(0)); + empty_metrics.insert( + "f2".into(), + vector_lib::event::Value::Float(NotNan::new(0.0).unwrap()), + ); + event2.insert( + "response_metrics", + vector_lib::event::Value::Object(empty_metrics), + ); + + event2.insert("tags", vector_lib::event::Value::Array(vec![])); + event2.insert( + "user_properties", + vector_lib::event::Value::Object(empty_map), + ); + event2.insert("array_with_nulls", vector_lib::event::Value::Array(vec![])); + + events.push(event2.into()); + + // Event 3: More varied data + let mut event3 = LogEvent::from("Test varied data"); + event3.insert("host", "host3.example.com"); + + event3.insert( + "nested_int_array", + vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Array(vec![]), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(99)]), + ]), + ); + event3.insert( + "nested_string_array", + 
vector_lib::event::Value::Array(vec![vector_lib::event::Value::Array(vec![ + vector_lib::event::Value::Bytes("test".into()), + ])]), + ); + + let mut map3 = vector_lib::event::ObjectMap::new(); + map3.insert( + "colors".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Bytes("red".into())]), + ); + event3.insert("array_map", vector_lib::event::Value::Object(map3)); + + let mut int_map3 = vector_lib::event::ObjectMap::new(); + int_map3.insert( + "values".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(42)]), + ); + event3.insert("int_array_map", vector_lib::event::Value::Object(int_map3)); + + let mut tuple3 = vector_lib::event::ObjectMap::new(); + tuple3.insert("f0".into(), vector_lib::event::Value::Bytes("data".into())); + tuple3.insert( + "f1".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(5)]), + ); + event3.insert("tuple_with_array", vector_lib::event::Value::Object(tuple3)); + + let mut map_inner = vector_lib::event::ObjectMap::new(); + map_inner.insert( + "val".into(), + vector_lib::event::Value::Float(NotNan::new(1.0).unwrap()), + ); + let mut tuple_map3 = vector_lib::event::ObjectMap::new(); + tuple_map3.insert("f0".into(), vector_lib::event::Value::Bytes("test".into())); + tuple_map3.insert("f1".into(), vector_lib::event::Value::Object(map_inner)); + event3.insert( + "tuple_with_map", + vector_lib::event::Value::Object(tuple_map3), + ); + + let mut map_inner2 = vector_lib::event::ObjectMap::new(); + map_inner2.insert( + "x".into(), + vector_lib::event::Value::Float(NotNan::new(2.0).unwrap()), + ); + let mut tuple_nested3 = vector_lib::event::ObjectMap::new(); + tuple_nested3.insert("f0".into(), vector_lib::event::Value::Bytes("nest".into())); + tuple_nested3.insert( + "f1".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(1)]), + ); + tuple_nested3.insert("f2".into(), vector_lib::event::Value::Object(map_inner2)); + event3.insert( + "tuple_with_nested", + vector_lib::event::Value::Object(tuple_nested3), + ); + + let mut loc3 = vector_lib::event::ObjectMap::new(); + loc3.insert("f0".into(), vector_lib::event::Value::Bytes("NYC".into())); + loc3.insert( + "f1".into(), + vector_lib::event::Value::Float(NotNan::new(40.7128).unwrap()), + ); + loc3.insert( + "f2".into(), + vector_lib::event::Value::Float(NotNan::new(-74.0060).unwrap()), + ); + event3.insert( + "locations", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(loc3)]), + ); + + let mut tags3 = vector_lib::event::ObjectMap::new(); + tags3.insert("env".into(), vector_lib::event::Value::Bytes("dev".into())); + event3.insert( + "tags_history", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(tags3)]), + ); + + let mut metrics3 = vector_lib::event::ObjectMap::new(); + metrics3.insert("cpu".into(), vector_lib::event::Value::Integer(60)); + event3.insert( + "metrics_history", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Object(metrics3)]), + ); + + let mut headers3 = vector_lib::event::ObjectMap::new(); + headers3.insert( + "content-type".into(), + vector_lib::event::Value::Bytes("application/json".into()), + ); + event3.insert( + "request_headers", + vector_lib::event::Value::Object(headers3), + ); + + let mut metrics3_resp = vector_lib::event::ObjectMap::new(); + metrics3_resp.insert("f0".into(), vector_lib::event::Value::Integer(404)); + metrics3_resp.insert("f1".into(), vector_lib::event::Value::Integer(0)); + metrics3_resp.insert( + 
"f2".into(), + vector_lib::event::Value::Float(NotNan::new(0.001).unwrap()), + ); + event3.insert( + "response_metrics", + vector_lib::event::Value::Object(metrics3_resp), + ); + + event3.insert( + "tags", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Bytes("test".into())]), + ); + + let mut user_props3 = vector_lib::event::ObjectMap::new(); + user_props3.insert( + "permissions".into(), + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Bytes("read".into())]), + ); + event3.insert( + "user_properties", + vector_lib::event::Value::Object(user_props3), + ); + + event3.insert( + "array_with_nulls", + vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(42)]), + ); + + events.push(event3.into()); + + run_and_assert_sink_compliance(sink, stream::iter(events), &SINK_TAGS).await; + + let output = client.select_all(&table).await; + assert_eq!(3, output.rows); + + // Verify event 1 - comprehensive data + let row1 = &output.data[0]; + assert!( + row1.get("nested_int_array") + .and_then(|v| v.as_array()) + .is_some() + ); + assert!(row1.get("array_map").and_then(|v| v.as_object()).is_some()); + // Tuples are returned as arrays from ClickHouse + assert!( + row1.get("tuple_with_array") + .and_then(|v| v.as_array()) + .is_some() + ); + assert!(row1.get("locations").and_then(|v| v.as_array()).is_some()); + assert!( + row1.get("tags_history") + .and_then(|v| v.as_array()) + .is_some() + ); + assert!( + row1.get("request_headers") + .and_then(|v| v.as_object()) + .is_some() + ); + assert!( + row1.get("array_with_nulls") + .and_then(|v| v.as_array()) + .is_some() + ); + + // Verify event 2 - empty collections + let row2 = &output.data[1]; + let empty_nested = row2 + .get("nested_int_array") + .and_then(|v| v.as_array()) + .unwrap(); + assert_eq!(0, empty_nested.len()); + let empty_tags = row2.get("tags").and_then(|v| v.as_array()).unwrap(); + assert_eq!(0, empty_tags.len()); + + // Verify event 3 - varied data + let row3 = &output.data[2]; + let nested3 = row3 + .get("nested_int_array") + .and_then(|v| v.as_array()) + .unwrap(); + assert_eq!(2, nested3.len()); +} From d35116c6916548193bee1346c2ae1c3364235fa5 Mon Sep 17 00:00:00 2001 From: benjamin-awd Date: Tue, 23 Dec 2025 22:43:20 +0800 Subject: [PATCH 03/11] refactor: use idiomatic Arrow types instead of Arc --- .../src/encoding/format/arrow/builder.rs | 5 +- lib/codecs/src/encoding/format/arrow/mod.rs | 16 +-- lib/codecs/src/encoding/format/arrow/tests.rs | 112 ++++++++---------- .../encoding/format/arrow/types/complex.rs | 3 +- 4 files changed, 61 insertions(+), 75 deletions(-) diff --git a/lib/codecs/src/encoding/format/arrow/builder.rs b/lib/codecs/src/encoding/format/arrow/builder.rs index 1c7e7a613156d..715c09f2a9c60 100644 --- a/lib/codecs/src/encoding/format/arrow/builder.rs +++ b/lib/codecs/src/encoding/format/arrow/builder.rs @@ -1,9 +1,8 @@ use arrow::{ array::ArrayRef, - datatypes::{DataType, Schema}, + datatypes::{DataType, SchemaRef}, record_batch::RecordBatch, }; -use std::sync::Arc; use vector_core::event::Event; use crate::encoding::format::arrow::{ @@ -19,7 +18,7 @@ use crate::encoding::format::arrow::{ /// Builds an Arrow RecordBatch from events pub(crate) fn build_record_batch( - schema: Arc, + schema: SchemaRef, events: &[Event], ) -> Result { let num_fields = schema.fields().len(); diff --git a/lib/codecs/src/encoding/format/arrow/mod.rs b/lib/codecs/src/encoding/format/arrow/mod.rs index c87ad4226043e..c21c65a68bdb5 100644 --- a/lib/codecs/src/encoding/format/arrow/mod.rs +++ 
b/lib/codecs/src/encoding/format/arrow/mod.rs @@ -11,7 +11,7 @@ mod types; mod tests; use arrow::{ - datatypes::{DataType, Schema}, + datatypes::{DataType, FieldRef, Schema, SchemaRef}, ipc::writer::StreamWriter, }; use async_trait::async_trait; @@ -94,7 +94,7 @@ impl ArrowStreamSerializerConfig { /// Arrow IPC stream batch serializer that holds the schema #[derive(Clone, Debug)] pub struct ArrowStreamSerializer { - schema: Arc, + schema: SchemaRef, } impl ArrowStreamSerializer { @@ -111,8 +111,8 @@ impl ArrowStreamSerializer { schema .fields() .iter() - .map(|f| Arc::new(make_field_nullable(f))) - .collect::>(), + .map(|f| make_field_nullable(f).into()) + .collect::>(), schema.metadata().clone(), ) } else { @@ -120,7 +120,7 @@ impl ArrowStreamSerializer { }; Ok(Self { - schema: Arc::new(schema), + schema: SchemaRef::new(schema), }) } } @@ -213,7 +213,7 @@ impl From for ArrowEncodingError { /// Encodes a batch of events into Arrow IPC streaming format pub fn encode_events_to_arrow_ipc_stream( events: &[vector_core::event::Event], - schema: Option>, + schema: Option, ) -> Result { if events.is_empty() { return Err(ArrowEncodingError::NoEvents); @@ -237,7 +237,7 @@ pub fn encode_events_to_arrow_ipc_stream( /// Recursively makes a Field and all its nested fields nullable fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Field { let new_data_type = match field.data_type() { - DataType::List(inner_field) => DataType::List(Arc::new(make_field_nullable(inner_field))), + DataType::List(inner_field) => DataType::List(make_field_nullable(inner_field).into()), DataType::Struct(fields) => { DataType::Struct(fields.iter().map(|f| make_field_nullable(f)).collect()) } @@ -258,7 +258,7 @@ fn make_field_nullable(field: &arrow::datatypes::Field) -> arrow::datatypes::Fie .with_data_type(DataType::Struct(new_struct_fields.into())) .with_nullable(false); - DataType::Map(Arc::new(new_inner_field), *sorted) + DataType::Map(new_inner_field.into(), *sorted) } other => other.clone(), }; diff --git a/lib/codecs/src/encoding/format/arrow/tests.rs b/lib/codecs/src/encoding/format/arrow/tests.rs index 1a05ed19eb0a2..a3374e968a50c 100644 --- a/lib/codecs/src/encoding/format/arrow/tests.rs +++ b/lib/codecs/src/encoding/format/arrow/tests.rs @@ -5,7 +5,7 @@ use arrow::{ StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, }, - datatypes::{DataType, Field, Fields, Schema, TimeUnit}, + datatypes::{DataType, Field, Fields, Schema, SchemaRef, TimeUnit}, ipc::reader::StreamReader, }; use chrono::Utc; @@ -77,7 +77,7 @@ fn test_encode_all_types() { false, ); - let schema = Arc::new(Schema::new(vec![ + let schema = SchemaRef::new(Schema::new(vec![ Field::new("string_field", DataType::Utf8, true), Field::new("int8_field", DataType::Int8, true), Field::new("int16_field", DataType::Int16, true), @@ -99,15 +99,11 @@ fn test_encode_all_types() { Field::new("decimal_field", DataType::Decimal128(10, 2), true), Field::new( "list_field", - DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + DataType::List(Field::new("item", DataType::Int64, true).into()), true, ), Field::new("struct_field", DataType::Struct(struct_fields), true), - Field::new( - "map_field", - DataType::Map(Arc::new(map_entries), false), - true, - ), + Field::new("map_field", DataType::Map(map_entries.into(), false), true), ])); let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); @@ -315,7 +311,7 @@ fn test_encode_null_values() { let 
events = vec![Event::Log(log1), Event::Log(log2)]; - let schema = Arc::new(Schema::new(vec![ + let schema = SchemaRef::new(Schema::new(vec![ Field::new("field_a", DataType::Int64, true), Field::new("field_b", DataType::Int64, true), ])); @@ -358,7 +354,7 @@ fn test_encode_type_mismatches() { let events = vec![Event::Log(log1), Event::Log(log2)]; // Schema expects Int64 - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "field", DataType::Int64, true, @@ -396,7 +392,7 @@ fn test_encode_complex_json_values() { let events = vec![Event::Log(log)]; - let schema = Arc::new(Schema::new(vec![ + let schema = SchemaRef::new(Schema::new(vec![ Field::new("object_field", DataType::Utf8, true), Field::new("array_field", DataType::Utf8, true), ])); @@ -437,7 +433,7 @@ fn test_encode_unsupported_type() { let events = vec![Event::Log(log)]; // Use an unsupported type - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "field", DataType::Duration(TimeUnit::Millisecond), true, @@ -485,7 +481,7 @@ fn test_encode_timestamp_precisions() { let events = vec![Event::Log(log)]; - let schema = Arc::new(Schema::new(vec![ + let schema = SchemaRef::new(Schema::new(vec![ Field::new( "ts_second", DataType::Timestamp(TimeUnit::Second, None), @@ -566,7 +562,7 @@ fn test_encode_mixed_timestamp_string_and_native() { let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true, @@ -618,7 +614,7 @@ fn test_encode_invalid_string_timestamp() { let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "timestamp", DataType::Timestamp(TimeUnit::Nanosecond, None), true, @@ -657,7 +653,7 @@ fn test_encode_decimal128_from_integer() { let events = vec![Event::Log(log)]; // Decimal(10, 3) - will represent 1000 as 1000.000 - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "quantity", DataType::Decimal128(10, 3), true, @@ -695,7 +691,7 @@ fn test_encode_decimal256() { let events = vec![Event::Log(log)]; // Decimal256(50, 6) - high precision decimal - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "big_value", DataType::Decimal256(50, 6), true, @@ -738,7 +734,7 @@ fn test_encode_decimal_null_values() { let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "price", DataType::Decimal128(10, 2), true, @@ -789,7 +785,7 @@ fn test_encode_unsigned_integers_with_null_and_overflow() { let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - let schema = Arc::new(Schema::new(vec![ + let schema = SchemaRef::new(Schema::new(vec![ Field::new("uint8_field", DataType::UInt8, true), Field::new("uint32_field", DataType::UInt32, true), ])); @@ -837,7 +833,7 @@ fn test_encode_non_nullable_field_with_null_value() { let events = vec![Event::Log(log1), Event::Log(log2)]; // Create schema with non-nullable field - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "required_field", DataType::Int64, false, // 
Not nullable @@ -868,7 +864,7 @@ fn test_encode_non_nullable_field_all_values_present() { let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "id", DataType::Int64, false, // Not nullable @@ -956,7 +952,7 @@ fn test_make_field_nullable_with_nested_types() { let inner_struct_field = Field::new("nested_field", DataType::Int64, false); let inner_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![inner_struct_field])); let list_field = Field::new("item", inner_struct, false); - let list_type = DataType::List(Arc::new(list_field)); + let list_type = DataType::List(list_field.into()); let outer_field = Field::new("inner_list", list_type, false); let outer_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![outer_field])); @@ -1016,7 +1012,7 @@ fn test_make_field_nullable_with_map_type() { let entries_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![key_field, value_field])); let entries_field = Field::new("entries", entries_struct, false); - let map_type = DataType::Map(Arc::new(entries_field), false); + let map_type = DataType::Map(entries_field.into(), false); let original_field = Field::new("my_map", map_type, false); @@ -1092,7 +1088,7 @@ fn test_encode_nested_maps() { ])), false, ); - let inner_map_type = DataType::Map(Arc::new(inner_map_entries), false); + let inner_map_type = DataType::Map(inner_map_entries.into(), false); let outer_map_entries = Field::new( "entries", @@ -1102,9 +1098,9 @@ fn test_encode_nested_maps() { ])), false, ); - let outer_map_type = DataType::Map(Arc::new(outer_map_entries), false); + let outer_map_type = DataType::Map(outer_map_entries.into(), false); - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "nested_map", outer_map_type, true, @@ -1190,12 +1186,12 @@ fn test_encode_array_of_maps() { ])), false, ); - let map_type = DataType::Map(Arc::new(map_entries), false); + let map_type = DataType::Map(map_entries.into(), false); let list_field = Field::new("item", map_type, true); - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "array_of_maps", - DataType::List(Arc::new(list_field)), + DataType::List(list_field.into()), true, )])); @@ -1269,9 +1265,9 @@ fn test_encode_array_of_structs() { let struct_type = DataType::Struct(struct_fields); let list_field = Field::new("item", struct_type, true); - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "array_of_structs", - DataType::List(Arc::new(list_field)), + DataType::List(list_field.into()), true, )])); @@ -1356,22 +1352,14 @@ fn test_encode_empty_arrays_and_maps() { false, ); - let schema = Arc::new(Schema::new(vec![ + let schema = SchemaRef::new(Schema::new(vec![ Field::new( "empty_array", - DataType::List(Arc::new(array_field.clone())), - true, - ), - Field::new( - "empty_map", - DataType::Map(Arc::new(map_entries), false), - true, - ), - Field::new( - "non_empty_array", - DataType::List(Arc::new(array_field)), + DataType::List(array_field.clone().into()), true, ), + Field::new("empty_map", DataType::Map(map_entries.into(), false), true), + Field::new("non_empty_array", DataType::List(array_field.into()), true), ])); let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); @@ -1433,10 +1421,10 @@ fn test_encode_deep_nesting() { // Define 
schema for deep array nesting (6 levels total) let mut current_field = Field::new("item", DataType::Int32, true); for _ in 0..5 { - current_field = Field::new("item", DataType::List(Arc::new(current_field)), true); + current_field = Field::new("item", DataType::List(current_field.into()), true); } - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "deep_array", current_field.data_type().clone(), true, @@ -1532,13 +1520,13 @@ fn test_encode_struct_with_list_and_map() { let struct_fields = arrow::datatypes::Fields::from(vec![ Field::new( "f0", - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(Field::new("item", DataType::Int32, true).into()), true, ), - Field::new("f1", DataType::Map(Arc::new(map_entries), false), true), + Field::new("f1", DataType::Map(map_entries.into(), false), true), ]); - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "complex_struct", DataType::Struct(struct_fields), true, @@ -1635,9 +1623,9 @@ fn test_encode_map_with_struct_values() { false, ); - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "map_with_structs", - DataType::Map(Arc::new(map_entries), false), + DataType::Map(map_entries.into(), false), true, )])); @@ -1738,14 +1726,14 @@ fn test_encode_list_of_structs_containing_maps() { let struct_fields = arrow::datatypes::Fields::from(vec![ Field::new("f0", DataType::Int32, true), - Field::new("f1", DataType::Map(Arc::new(map_entries), false), true), + Field::new("f1", DataType::Map(map_entries.into(), false), true), ]); let list_field = Field::new("item", DataType::Struct(struct_fields), true); - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "list_of_structs_with_maps", - DataType::List(Arc::new(list_field)), + DataType::List(list_field.into()), true, )])); @@ -1849,12 +1837,12 @@ fn test_encode_deeply_nested_mixed_types() { let inner_struct_fields = arrow::datatypes::Fields::from(vec![ Field::new( "f0", - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(Field::new("item", DataType::Int32, true).into()), true, ), Field::new( "f1", - DataType::Map(Arc::new(metadata_map_entries), false), + DataType::Map(metadata_map_entries.into(), false), true, ), ]); @@ -1868,15 +1856,15 @@ fn test_encode_deeply_nested_mixed_types() { false, ); - let list_field = Field::new("item", DataType::Map(Arc::new(map_entries), false), true); + let list_field = Field::new("item", DataType::Map(map_entries.into(), false), true); let outer_struct_fields = arrow::datatypes::Fields::from(vec![Field::new( "f0", - DataType::List(Arc::new(list_field)), + DataType::List(list_field.into()), true, )]); - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = SchemaRef::new(Schema::new(vec![Field::new( "deeply_nested", DataType::Struct(outer_struct_fields), true, @@ -1975,7 +1963,7 @@ fn test_automatic_json_serialization_for_array_of_objects() { // The encoder should automatically serialize objects to JSON strings let schema = Schema::new(vec![Field::new( "components", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Field::new("item", DataType::Utf8, true).into()), false, )]); @@ -2042,7 +2030,7 @@ fn test_object_in_map_values_to_string() { let value_field = Field::new("values", DataType::Utf8, true); let entries_struct = 
DataType::Struct(Fields::from(vec![key_field, value_field])); let entries_field = Field::new("entries", entries_struct, false); - let map_type = DataType::Map(Arc::new(entries_field), false); + let map_type = DataType::Map(entries_field.into(), false); let schema = Schema::new(vec![Field::new("settings", map_type, false)]); @@ -2107,8 +2095,8 @@ fn test_nested_arrays_with_objects() { // Schema: Array(Array(String)) let inner_field = Field::new("item", DataType::Utf8, true); - let middle_field = Field::new("item", DataType::List(Arc::new(inner_field)), true); - let outer_list = DataType::List(Arc::new(middle_field)); + let middle_field = Field::new("item", DataType::List(inner_field.into()), true); + let outer_list = DataType::List(middle_field.into()); let schema = Schema::new(vec![Field::new("nested", outer_list, false)]); diff --git a/lib/codecs/src/encoding/format/arrow/types/complex.rs b/lib/codecs/src/encoding/format/arrow/types/complex.rs index 8a3e62ad41ff7..9ffd95f175742 100644 --- a/lib/codecs/src/encoding/format/arrow/types/complex.rs +++ b/lib/codecs/src/encoding/format/arrow/types/complex.rs @@ -408,7 +408,6 @@ mod tests { Array, Int32Array, Int64Array, ListArray, MapArray, StringArray, StructArray, }; use arrow::datatypes::{DataType, Field, Fields}; - use std::sync::Arc; use vector_core::event::{Event, LogEvent, Value}; use vrl::value::ObjectMap; @@ -720,7 +719,7 @@ mod tests { Field::new("f0", DataType::Utf8, true), Field::new( "f1", - DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + DataType::List(Field::new("item", DataType::Int64, true).into()), true, ), ]); From dccf192b3472a11bc95286f7e738091ba81783bb Mon Sep 17 00:00:00 2001 From: benjamin-awd Date: Wed, 24 Dec 2025 00:30:52 +0800 Subject: [PATCH 04/11] refactor: unify primitives, decimal, temporal + complex logic --- .../src/encoding/format/arrow/builder.rs | 472 ++++++++-- .../format/arrow/{types/mod.rs => types.rs} | 39 +- .../encoding/format/arrow/types/complex.rs | 815 ------------------ .../encoding/format/arrow/types/decimal.rs | 116 --- .../encoding/format/arrow/types/primitives.rs | 187 ---- .../encoding/format/arrow/types/temporal.rs | 85 -- 6 files changed, 431 insertions(+), 1283 deletions(-) rename lib/codecs/src/encoding/format/arrow/{types/mod.rs => types.rs} (66%) delete mode 100644 lib/codecs/src/encoding/format/arrow/types/complex.rs delete mode 100644 lib/codecs/src/encoding/format/arrow/types/decimal.rs delete mode 100644 lib/codecs/src/encoding/format/arrow/types/primitives.rs delete mode 100644 lib/codecs/src/encoding/format/arrow/types/temporal.rs diff --git a/lib/codecs/src/encoding/format/arrow/builder.rs b/lib/codecs/src/encoding/format/arrow/builder.rs index 715c09f2a9c60..157fbb6b5745c 100644 --- a/lib/codecs/src/encoding/format/arrow/builder.rs +++ b/lib/codecs/src/encoding/format/arrow/builder.rs @@ -1,70 +1,438 @@ +//! Arrow record batch builder +//! +//! Builds Arrow RecordBatches from Vector events by creating appropriate +//! array builders and appending values according to the schema. 
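+//!
+//! High-level flow, as an illustrative sketch only (the exact signatures of
+//! `create_array_builder_for_type` and the append helpers are defined below):
+//!
+//! ```ignore
+//! // One builder per schema field, then one append per event per column.
+//! let mut builders: Vec<Box<dyn ArrayBuilder>> =
+//!     /* create_array_builder_for_type for each field */;
+//! for event in events {
+//!     /* append_value_to_builder (or append_null_to_builder) per field */
+//! }
+//! // Finish the builders into ArrayRefs and assemble the batch.
+//! let batch = RecordBatch::try_new(schema, columns)?;
+//! ```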
+
 use arrow::{
-    array::ArrayRef,
-    datatypes::{DataType, Schema},
-    record_batch::RecordBatch,
-};
-use std::sync::Arc;
-use vector_core::event::Event;
-
-use crate::encoding::format::arrow::{
-    ArrowEncodingError,
-    types::{
-        build_binary_array, build_boolean_array, build_decimal128_array, build_decimal256_array,
-        build_float32_array, build_float64_array, build_int8_array, build_int16_array,
-        build_int32_array, build_int64_array, build_list_array, build_map_array,
-        build_string_array, build_struct_array, build_timestamp_array, build_uint8_array,
-        build_uint16_array, build_uint32_array, build_uint64_array,
+    array::{
+        ArrayBuilder, ArrayRef, BinaryBuilder, BooleanBuilder, Decimal128Builder,
+        Decimal256Builder, Float32Builder, Float64Builder, Int8Builder, Int16Builder, Int32Builder,
+        Int64Builder, ListBuilder, MapBuilder, StringBuilder, StructBuilder,
+        TimestampMicrosecondBuilder, TimestampMillisecondBuilder, TimestampNanosecondBuilder,
+        TimestampSecondBuilder, UInt8Builder, UInt16Builder, UInt32Builder, UInt64Builder,
     },
+    datatypes::{DataType, Field, SchemaRef, TimeUnit, i256},
+    record_batch::RecordBatch,
 };
+use vector_core::event::{Event, Value};
 
-/// Builds an Arrow RecordBatch from events
-pub(crate) fn build_record_batch(
-    schema: SchemaRef,
-    events: &[Event],
-) -> Result<RecordBatch, ArrowEncodingError> {
-    let num_fields = schema.fields().len();
-    let mut columns: Vec<ArrayRef> = Vec::with_capacity(num_fields);
+use super::{ArrowEncodingError, types::create_array_builder_for_type};
 
-    for field in schema.fields() {
-        let field_name = field.name();
-        let nullable = field.is_nullable();
-        let array: ArrayRef = match field.data_type() {
-            DataType::Timestamp(time_unit, _) => {
-                build_timestamp_array(events, field_name, *time_unit, nullable)?
+/// Checks if a data type is supported by the Arrow encoder.
+fn is_supported_type(data_type: &DataType) -> bool {
+    matches!(
+        data_type,
+        DataType::Int8
+            | DataType::Int16
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::UInt8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64
+            | DataType::Float32
+            | DataType::Float64
+            | DataType::Boolean
+            | DataType::Utf8
+            | DataType::Binary
+            | DataType::Timestamp(_, _)
+            | DataType::Decimal128(_, _)
+            | DataType::Decimal256(_, _)
+            | DataType::List(_)
+            | DataType::Struct(_)
+            | DataType::Map(_, _)
+    )
+}
+
+/// Helper macro for downcasting builders
+macro_rules! downcast_builder {
+    // Infallible version - used for non-complex types
+    ($builder:expr, $builder_type:ty) => {
+        $builder
+            .as_any_mut()
+            .downcast_mut::<$builder_type>()
+            .expect(concat!(
+                "Failed to downcast builder to ",
+                stringify!($builder_type)
+            ))
+    };
+
+    // Fallible version - used for complex types (returns Result for error handling)
+    ($builder:expr, $builder_type:ty, $field:expr) => {
+        $builder
+            .as_any_mut()
+            .downcast_mut::<$builder_type>()
+            .ok_or_else(|| ArrowEncodingError::UnsupportedType {
+                field_name: $field.name().clone(),
+                data_type: $field.data_type().clone(),
+            })
+    };
+}
+
+/// Macro to simplify appending null values by generating match arms
+macro_rules! append_null_match {
+    ($builder:expr, $data_type:expr, {$($pattern:pat => $builder_type:ty),* $(,)?}) => {
+        match $data_type {
+            $($pattern => downcast_builder!($builder, $builder_type).append_null(),)*
+            _ => {}
+        }
+    };
+}
+
+/// Macro to simplify integer/float appending with bounds checking and casting.
+macro_rules! append_primitive {
+    // Simple case: no bounds checking
+    ($builder:expr, $builder_type:ty, $val:expr, $cast_type:ty) => {{
+        downcast_builder!($builder, $builder_type).append_value(*$val as $cast_type);
+    }};
+    // With bounds checking
+    ($builder:expr, $builder_type:ty, $val:expr, $cast_type:ty, $min:expr, $max:expr) => {{
+        if *$val >= $min as i64 && *$val <= $max as i64 {
+            downcast_builder!($builder, $builder_type).append_value(*$val as $cast_type);
+        } else {
+            downcast_builder!($builder, $builder_type).append_null();
+        }
+    }};
+}
+
+/// Helper function to serialize a Value to JSON string.
+/// This is used when the schema expects a string but the data contains complex types.
+fn value_to_json_string(value: &Value) -> Result<String, ArrowEncodingError> {
+    serde_json::to_string(value).map_err(|e| ArrowEncodingError::Io {
+        source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
+    })
+}
+
+/// Appends a null value to an array builder based on its type.
+fn append_null_to_builder(
+    builder: &mut dyn ArrayBuilder,
+    data_type: &DataType,
+) -> Result<(), ArrowEncodingError> {
+    append_null_match!(builder, data_type, {
+        DataType::Int8 => Int8Builder,
+        DataType::Int16 => Int16Builder,
+        DataType::Int32 => Int32Builder,
+        DataType::Int64 => Int64Builder,
+        DataType::UInt8 => UInt8Builder,
+        DataType::UInt16 => UInt16Builder,
+        DataType::UInt32 => UInt32Builder,
+        DataType::UInt64 => UInt64Builder,
+        DataType::Float32 => Float32Builder,
+        DataType::Float64 => Float64Builder,
+        DataType::Boolean => BooleanBuilder,
+        DataType::Utf8 => StringBuilder,
+        DataType::Binary => BinaryBuilder,
+        DataType::Timestamp(TimeUnit::Second, _) => TimestampSecondBuilder,
+        DataType::Timestamp(TimeUnit::Millisecond, _) => TimestampMillisecondBuilder,
+        DataType::Timestamp(TimeUnit::Microsecond, _) => TimestampMicrosecondBuilder,
+        DataType::Timestamp(TimeUnit::Nanosecond, _) => TimestampNanosecondBuilder,
+        DataType::Decimal128(_, _) => Decimal128Builder,
+        DataType::Decimal256(_, _) => Decimal256Builder,
+        DataType::List(_) => ListBuilder<Box<dyn ArrayBuilder>>,
+        DataType::Struct(_) => StructBuilder,
+    });
+
+    // Special case: Map uses append(false) instead of append_null()
+    if matches!(data_type, DataType::Map(_, _)) {
+        downcast_builder!(builder, MapBuilder<StringBuilder, Box<dyn ArrayBuilder>>)
+            .append(false)
+            .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?;
+    }
+
+    Ok(())
+}
+
+/// Recursively appends a VRL Value to an Arrow array builder.
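+///
+/// Illustrative sketch (not a test in this patch): integers that do not fit
+/// the target width degrade to null via the bounds check in `append_primitive!`.
+///
+/// ```ignore
+/// let field = Field::new("n", DataType::Int8, true);
+/// let mut builder = Int8Builder::new();
+/// // 300 is outside i8::MIN..=i8::MAX, so a null is appended, not an error.
+/// append_value_to_builder(&mut builder, &Value::Integer(300), &field)?;
+/// assert_eq!(builder.finish().null_count(), 1);
+/// ```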
+fn append_value_to_builder( + builder: &mut dyn ArrayBuilder, + value: &Value, + field: &Field, +) -> Result<(), ArrowEncodingError> { + match (field.data_type(), value) { + // Integer types with range checking + (DataType::Int8, Value::Integer(i)) => { + append_primitive!(builder, Int8Builder, i, i8, i8::MIN, i8::MAX) + } + (DataType::Int16, Value::Integer(i)) => { + append_primitive!(builder, Int16Builder, i, i16, i16::MIN, i16::MAX) + } + (DataType::Int32, Value::Integer(i)) => { + append_primitive!(builder, Int32Builder, i, i32, i32::MIN, i32::MAX) + } + (DataType::Int64, Value::Integer(i)) => append_primitive!(builder, Int64Builder, i, i64), + + // Unsigned integer types with range checking + (DataType::UInt8, Value::Integer(i)) => { + append_primitive!(builder, UInt8Builder, i, u8, 0, u8::MAX) + } + (DataType::UInt16, Value::Integer(i)) => { + append_primitive!(builder, UInt16Builder, i, u16, 0, u16::MAX) + } + (DataType::UInt32, Value::Integer(i)) => { + append_primitive!(builder, UInt32Builder, i, u32, 0, u32::MAX) + } + (DataType::UInt64, Value::Integer(i)) => { + if *i >= 0 { + append_primitive!(builder, UInt64Builder, i, u64); + } else { + downcast_builder!(builder, UInt64Builder).append_null(); + } + } + + // Float types + (DataType::Float32, Value::Float(f)) => { + let val = f.into_inner(); + downcast_builder!(builder, Float32Builder).append_value(val as f32); + } + (DataType::Float32, Value::Integer(i)) => { + append_primitive!(builder, Float32Builder, i, f32) + } + (DataType::Float64, Value::Float(f)) => { + let val = f.into_inner(); + downcast_builder!(builder, Float64Builder).append_value(val); + } + (DataType::Float64, Value::Integer(i)) => { + append_primitive!(builder, Float64Builder, i, f64) + } + + // Boolean + (DataType::Boolean, Value::Boolean(b)) => { + downcast_builder!(builder, BooleanBuilder).append_value(*b); + } + // String types + (DataType::Utf8, Value::Bytes(bytes)) => match std::str::from_utf8(bytes) { + Ok(s) => downcast_builder!(builder, StringBuilder).append_value(s), + Err(_) => { + let s = String::from_utf8_lossy(bytes); + downcast_builder!(builder, StringBuilder).append_value(&s) + } + }, + // Object -> String + (DataType::Utf8, Value::Object(obj)) => { + let json_str = value_to_json_string(&Value::Object(obj.clone()))?; + downcast_builder!(builder, StringBuilder).append_value(&json_str); + } + // Array -> String + (DataType::Utf8, Value::Array(arr)) => { + let json_str = value_to_json_string(&Value::Array(arr.clone()))?; + downcast_builder!(builder, StringBuilder).append_value(&json_str); + } + (DataType::Binary, Value::Bytes(bytes)) => { + downcast_builder!(builder, BinaryBuilder).append_value(bytes); + } + + // Timestamp types + (DataType::Timestamp(time_unit, _), value) => { + use chrono::Utc; + + let timestamp_value = match value { + Value::Timestamp(ts) => Some(*ts), + Value::Bytes(bytes) => std::str::from_utf8(bytes) + .ok() + .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok()) + .map(|dt| dt.with_timezone(&Utc)), + _ => None, + }; + + let converted_value = match (time_unit, timestamp_value) { + (TimeUnit::Second, Some(ts)) => Some(ts.timestamp()), + (TimeUnit::Millisecond, Some(ts)) => Some(ts.timestamp_millis()), + (TimeUnit::Microsecond, Some(ts)) => Some(ts.timestamp_micros()), + (TimeUnit::Nanosecond, Some(ts)) => ts.timestamp_nanos_opt(), + _ => { + // Fallback to raw integer if not a timestamp + if let Value::Integer(i) = value { + Some(*i) + } else { + None + } + } + }; + + match (time_unit, converted_value) { + 
+                (TimeUnit::Second, Some(val)) => {
+                    downcast_builder!(builder, TimestampSecondBuilder).append_value(val);
+                }
+                (TimeUnit::Millisecond, Some(val)) => {
+                    downcast_builder!(builder, TimestampMillisecondBuilder).append_value(val);
+                }
+                (TimeUnit::Microsecond, Some(val)) => {
+                    downcast_builder!(builder, TimestampMicrosecondBuilder).append_value(val);
+                }
+                (TimeUnit::Nanosecond, Some(val)) => {
+                    downcast_builder!(builder, TimestampNanosecondBuilder).append_value(val);
+                }
+                (TimeUnit::Second, None) => {
+                    downcast_builder!(builder, TimestampSecondBuilder).append_null();
+                }
+                (TimeUnit::Millisecond, None) => {
+                    downcast_builder!(builder, TimestampMillisecondBuilder).append_null();
+                }
+                (TimeUnit::Microsecond, None) => {
+                    downcast_builder!(builder, TimestampMicrosecondBuilder).append_null();
+                }
+                (TimeUnit::Nanosecond, None) => {
+                    downcast_builder!(builder, TimestampNanosecondBuilder).append_null();
+                }
+            }
+        }
+
+        // Decimal types
+        (DataType::Decimal128(_precision, scale), value) => {
+            use rust_decimal::Decimal;
+
+            let decimal_builder = builder
+                .as_any_mut()
+                .downcast_mut::<Decimal128Builder>()
+                .expect("Failed to downcast to Decimal128Builder");
+
+            let target_scale = scale.unsigned_abs() as u32;
+
+            match value {
+                Value::Float(f) => {
+                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
+                        decimal.rescale(target_scale);
+                        decimal_builder.append_value(decimal.mantissa());
+                    } else {
+                        decimal_builder.append_null();
+                    }
+                }
+                Value::Integer(i) => {
+                    let mut decimal = Decimal::from(*i);
+                    decimal.rescale(target_scale);
+                    decimal_builder.append_value(decimal.mantissa());
+                }
+                _ => decimal_builder.append_null(),
             }
-        DataType::Utf8 => build_string_array(events, field_name, nullable)?,
-        DataType::Int8 => build_int8_array(events, field_name, nullable)?,
-        DataType::Int16 => build_int16_array(events, field_name, nullable)?,
-        DataType::Int32 => build_int32_array(events, field_name, nullable)?,
-        DataType::Int64 => build_int64_array(events, field_name, nullable)?,
-        DataType::UInt8 => build_uint8_array(events, field_name, nullable)?,
-        DataType::UInt16 => build_uint16_array(events, field_name, nullable)?,
-        DataType::UInt32 => build_uint32_array(events, field_name, nullable)?,
-        DataType::UInt64 => build_uint64_array(events, field_name, nullable)?,
-        DataType::Float32 => build_float32_array(events, field_name, nullable)?,
-        DataType::Float64 => build_float64_array(events, field_name, nullable)?,
-        DataType::Boolean => build_boolean_array(events, field_name, nullable)?,
-        DataType::Binary => build_binary_array(events, field_name, nullable)?,
-        DataType::Decimal128(precision, scale) => {
-            build_decimal128_array(events, field_name, *precision, *scale, nullable)?
+        }
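Whether the old per-column function or the new per-value arm does it, the arithmetic is the same: an Arrow Decimal128(precision, scale) cell stores the unscaled mantissa. A small sketch of the rescale-then-mantissa step, assuming rust_decimal's documented API:

use rust_decimal::Decimal;

fn main() {
    // For a Decimal128(10, 2) column, 99.99 is stored as the mantissa 9999
    // (the same pairing the tests in this series assert).
    let mut decimal = Decimal::try_from(99.99_f64).expect("representable as Decimal");
    decimal.rescale(2);
    assert_eq!(decimal.mantissa(), 9999_i128);
}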
+
+        (DataType::Decimal256(_precision, scale), value) => {
+            use rust_decimal::Decimal;
+
+            let decimal_builder = builder
+                .as_any_mut()
+                .downcast_mut::<Decimal256Builder>()
+                .expect("Failed to downcast to Decimal256Builder");
+
+            let target_scale = scale.unsigned_abs() as u32;
+
+            match value {
+                Value::Float(f) => {
+                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
+                        decimal.rescale(target_scale);
+                        decimal_builder.append_value(i256::from_i128(decimal.mantissa()));
+                    } else {
+                        decimal_builder.append_null();
+                    }
+                }
+                Value::Integer(i) => {
+                    let mut decimal = Decimal::from(*i);
+                    decimal.rescale(target_scale);
+                    decimal_builder.append_value(i256::from_i128(decimal.mantissa()));
+                }
+                _ => decimal_builder.append_null(),
             }
-        DataType::Decimal256(precision, scale) => {
-            build_decimal256_array(events, field_name, *precision, *scale, nullable)?
         }
+
+        // Complex types
+        (DataType::List(inner_field), Value::Array(arr)) => {
+            let list_builder =
+                downcast_builder!(builder, ListBuilder<Box<dyn ArrayBuilder>>, field)?;
+
+            for item in arr.iter() {
+                append_value_to_builder(list_builder.values(), item, inner_field)?;
             }
-        DataType::List(inner_field) => {
-            build_list_array(events, field_name, inner_field, nullable)?
+            list_builder.append(true);
+        }
+
+        (DataType::Struct(fields), Value::Object(obj)) => {
+            let struct_builder = downcast_builder!(builder, StructBuilder, field)?;
+
+            for (i, field) in fields.iter().enumerate() {
+                let key = format!("f{}", i);
+                let field_builder = &mut struct_builder.field_builders_mut()[i];
+                match obj.get(key.as_str()) {
+                    Some(val) => append_value_to_builder(field_builder.as_mut(), val, field)?,
+                    None => append_null_to_builder(field_builder.as_mut(), field.data_type())?,
+                }
             }
-        DataType::Struct(fields) => build_struct_array(events, field_name, fields, nullable)?,
-        DataType::Map(entries_field, _) => {
-            build_map_array(events, field_name, entries_field, nullable)?
+            struct_builder.append(true);
+        }
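The `format!("f{}", i)` lookup above encodes a convention worth spelling out: a tuple-shaped struct arrives as a VRL object keyed positionally. A sketch of such a value, using the same ObjectMap type the tests in this series use:

use vrl::value::{ObjectMap, Value};

fn main() {
    // Struct field i is resolved via the generated key "f{i}".
    let mut tuple = ObjectMap::new();
    tuple.insert("f0".into(), Value::Bytes("nested_str".into()));
    tuple.insert("f1".into(), Value::Integer(999));

    assert!(tuple.get("f0").is_some());
    // A missing positional key is appended as null by the encoder.
    assert!(tuple.get("f2").is_none());
}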
+
+        (DataType::Map(entries_field, _), Value::Object(obj)) => {
+            let map_builder =
+                downcast_builder!(builder, MapBuilder<StringBuilder, Box<dyn ArrayBuilder>>, field)?;
+
+            let DataType::Struct(entries_struct) = entries_field.data_type() else {
+                return Err(ArrowEncodingError::UnsupportedType {
+                    field_name: field.name().clone(),
+                    data_type: field.data_type().clone(),
+                });
+            };
+
+            let value_field = &entries_struct[1];
+            for (key, value) in obj.iter() {
+                map_builder.keys().append_value(key.as_ref());
+                append_value_to_builder(map_builder.values(), value, value_field)?;
             }
-        other_type => {
+            map_builder
+                .append(true)
+                .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?;
+        }
+
+        // Unsupported type/value combinations
+        _ => {
+            if !is_supported_type(field.data_type()) {
                 return Err(ArrowEncodingError::UnsupportedType {
-                field_name: field_name.into(),
-                data_type: other_type.clone(),
+                    field_name: field.name().clone(),
+                    data_type: field.data_type().clone(),
+                });
+            }
+
+            // Supported type but value is missing/incompatible
+            if field.is_nullable() {
+                append_null_to_builder(builder, field.data_type())?;
+            } else {
+                return Err(ArrowEncodingError::NullConstraint {
+                    field_name: field.name().clone(),
                 });
             }
+        }
+    }
+    Ok(())
+}
+
+fn build_array_for_field(events: &[Event], field: &Field) -> Result<ArrayRef, ArrowEncodingError> {
+    let mut builder = create_array_builder_for_type(field.data_type(), events.len())?;
+
+    events.iter().try_for_each(|event| {
+        let Event::Log(log) = event else {
+            return Ok(());
+        };
+        match log.get(field.name().as_str()) {
+            Some(value) => append_value_to_builder(builder.as_mut(), value, field),
+            None if field.is_nullable() => {
+                append_null_to_builder(builder.as_mut(), field.data_type())
+            }
+            None => Err(ArrowEncodingError::NullConstraint {
+                field_name: field.name().clone(),
+            }),
+        }
+    })?;
+
+    Ok(builder.finish())
+}
+
+/// Builds an Arrow RecordBatch from events
+pub(crate) fn build_record_batch(
+    schema: SchemaRef,
+    events: &[Event],
+) -> Result<RecordBatch, ArrowEncodingError> {
+    let num_fields = schema.fields().len();
+    let mut columns: Vec<ArrayRef> = Vec::with_capacity(num_fields);
+
+    for field in schema.fields() {
+        let array = build_array_for_field(events, field)?;
         columns.push(array);
     }
 
diff --git a/lib/codecs/src/encoding/format/arrow/types/mod.rs b/lib/codecs/src/encoding/format/arrow/types.rs
similarity index 66%
rename from lib/codecs/src/encoding/format/arrow/types/mod.rs
rename to lib/codecs/src/encoding/format/arrow/types.rs
index 625855cf2ad3c..8bc82d9c64e16 100644
--- a/lib/codecs/src/encoding/format/arrow/types/mod.rs
+++ b/lib/codecs/src/encoding/format/arrow/types.rs
@@ -1,3 +1,8 @@
+//! Arrow type to array builder mapping
+//!
+//! Creates appropriate Arrow array builders for different data types,
+//! with special handling for complex nested types (List, Struct, Map).
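Before the diff continues, a sketch of what this factory hands back for primitive versus nested types; it assumes only arrow's public make_builder and ListBuilder APIs, with illustrative capacities:

use arrow::array::{make_builder, ArrayBuilder, ListBuilder};
use arrow::datatypes::DataType;

fn main() {
    // Primitive types can use arrow's stock factory directly.
    let primitive: Box<dyn ArrayBuilder> = make_builder(&DataType::Int64, 16);
    assert_eq!(primitive.len(), 0);

    // Nested types wrap an inner builder, over-provisioned for the elements
    // expected per row (compare NESTED_CAPACITY_MULTIPLIER below).
    let inner = make_builder(&DataType::Int64, 16 * 4);
    let mut list = ListBuilder::new(inner);
    list.append(true); // finishes one (empty) list entry
    assert_eq!(list.len(), 1);
}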
+
 use arrow::array::{
     ArrayBuilder, ListBuilder, MapBuilder, StringBuilder, StructBuilder, make_builder,
 };
@@ -5,20 +10,6 @@ use arrow::datatypes::DataType;
 
 use super::ArrowEncodingError;
 
-mod complex;
-mod decimal;
-mod primitives;
-mod temporal;
-
-pub(crate) use complex::{build_list_array, build_map_array, build_struct_array};
-pub(crate) use decimal::{build_decimal128_array, build_decimal256_array};
-pub(crate) use primitives::{
-    build_binary_array, build_boolean_array, build_float32_array, build_float64_array,
-    build_int8_array, build_int16_array, build_int32_array, build_int64_array, build_string_array,
-    build_uint8_array, build_uint16_array, build_uint32_array, build_uint64_array,
-};
-pub(crate) use temporal::build_timestamp_array;
-
 const NESTED_CAPACITY_MULTIPLIER: usize = 4;
 
 /// Creates an array builder for a given Arrow data type.
@@ -32,15 +23,9 @@ pub(crate) fn create_array_builder_for_type(
     capacity: usize,
 ) -> Result<Box<dyn ArrayBuilder>, ArrowEncodingError> {
     match data_type {
-        DataType::List(inner_field) => {
-            create_list_builder(inner_field.data_type(), capacity)
-        }
-        DataType::Struct(fields) => {
-            create_struct_builder(fields, capacity)
-        }
-        DataType::Map(entries_field, _) => {
-            create_map_builder(entries_field.data_type(), capacity)
-        }
+        DataType::List(inner_field) => create_list_builder(inner_field.data_type(), capacity),
+        DataType::Struct(fields) => create_struct_builder(fields, capacity),
+        DataType::Map(entries_field, _) => create_map_builder(entries_field.data_type(), capacity),
         _ => Ok(make_builder(data_type, capacity)),
     }
 }
@@ -78,10 +63,8 @@ fn create_map_builder(
     let nested_capacity = capacity * NESTED_CAPACITY_MULTIPLIER;
     let key_builder = StringBuilder::with_capacity(nested_capacity, 0);
 
-    let value_builder = create_array_builder_for_type(
-        entries_fields[1].data_type(),
-        nested_capacity,
-    )?;
-
+    let value_builder =
+        create_array_builder_for_type(entries_fields[1].data_type(), nested_capacity)?;
+
     Ok(Box::new(MapBuilder::new(None, key_builder, value_builder)))
 }
diff --git a/lib/codecs/src/encoding/format/arrow/types/complex.rs b/lib/codecs/src/encoding/format/arrow/types/complex.rs
deleted file mode 100644
index 9ffd95f175742..0000000000000
--- a/lib/codecs/src/encoding/format/arrow/types/complex.rs
+++ /dev/null
@@ -1,815 +0,0 @@
-//! Complex type array builders for Arrow encoding
-//!
-//! This module handles nested Arrow types: List, Struct (tuples), and Map.
-
-use arrow::array::{
-    ArrayBuilder, ArrayRef, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder,
-    Int8Builder, Int16Builder, Int32Builder, Int64Builder, ListBuilder, MapBuilder, StringBuilder,
-    StructBuilder, TimestampMicrosecondBuilder, TimestampMillisecondBuilder,
-    TimestampNanosecondBuilder, TimestampSecondBuilder, UInt8Builder, UInt16Builder, UInt32Builder,
-    UInt64Builder,
-};
-use arrow::datatypes::{DataType, Field, Fields, TimeUnit};
-use std::sync::Arc;
-
-use super::super::ArrowEncodingError;
-use super::create_array_builder_for_type;
-use vector_core::event::{Event, Value};
-
-/// Helper macro for downcasting builders
-macro_rules! downcast_builder {
-    ($builder:expr, $builder_type:ty) => {
-        $builder
-            .as_any_mut()
-            .downcast_mut::<$builder_type>()
-            .expect(concat!(
-                "Failed to downcast builder to ",
-                stringify!($builder_type)
-            ))
-    };
-}
-
-/// Helper function to serialize a Value to JSON string.
-/// This is used when the schema expects a string but the data contains complex types.
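The fallback described here is easiest to see with a concrete value; a short sketch, assuming only that vrl's Value implements serde::Serialize (which is what the serde_json::to_string call relies on):

use vrl::value::Value;

fn main() {
    // A complex value headed for a Utf8 column is rendered as JSON text.
    let value = Value::Array(vec![Value::Integer(1), Value::Integer(2), Value::Integer(3)]);
    let json = serde_json::to_string(&value).expect("Value serializes to JSON");
    assert_eq!(json, "[1,2,3]"); // matches the assertion in this series' tests
}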
-fn value_to_json_string(value: &Value) -> Result { - serde_json::to_string(value).map_err(|e| ArrowEncodingError::Io { - source: std::io::Error::new(std::io::ErrorKind::InvalidData, e), - }) -} - -/// Appends a null value to an array builder based on the data type. -fn append_null_to_builder( - builder: &mut dyn ArrayBuilder, - data_type: &DataType, -) -> Result<(), ArrowEncodingError> { - match data_type { - DataType::Int8 => downcast_builder!(builder, Int8Builder).append_null(), - DataType::Int16 => downcast_builder!(builder, Int16Builder).append_null(), - DataType::Int32 => downcast_builder!(builder, Int32Builder).append_null(), - DataType::Int64 => downcast_builder!(builder, Int64Builder).append_null(), - DataType::UInt8 => downcast_builder!(builder, UInt8Builder).append_null(), - DataType::UInt16 => downcast_builder!(builder, UInt16Builder).append_null(), - DataType::UInt32 => downcast_builder!(builder, UInt32Builder).append_null(), - DataType::UInt64 => downcast_builder!(builder, UInt64Builder).append_null(), - DataType::Float32 => downcast_builder!(builder, Float32Builder).append_null(), - DataType::Float64 => downcast_builder!(builder, Float64Builder).append_null(), - DataType::Boolean => downcast_builder!(builder, BooleanBuilder).append_null(), - DataType::Utf8 => downcast_builder!(builder, StringBuilder).append_null(), - DataType::Binary => downcast_builder!(builder, BinaryBuilder).append_null(), - DataType::Timestamp(TimeUnit::Second, _) => { - downcast_builder!(builder, TimestampSecondBuilder).append_null() - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - downcast_builder!(builder, TimestampMillisecondBuilder).append_null() - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - downcast_builder!(builder, TimestampMicrosecondBuilder).append_null() - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - downcast_builder!(builder, TimestampNanosecondBuilder).append_null() - } - DataType::List(_) => { - builder - .as_any_mut() - .downcast_mut::>>() - .expect("Failed to downcast to ListBuilder") - .append_null(); - } - DataType::Struct(_) => downcast_builder!(builder, StructBuilder).append_null(), - DataType::Map(_, _) => { - builder - .as_any_mut() - .downcast_mut::>>() - .expect("Failed to downcast to MapBuilder") - .append(false) - .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?; - } - _ => {} - } - Ok(()) -} - -/// Recursively appends a VRL Value to an Arrow array builder. 
-fn append_value_to_builder( - builder: &mut dyn ArrayBuilder, - value: &Value, - field: &Field, -) -> Result<(), ArrowEncodingError> { - match (field.data_type(), value) { - // Integer types with range checking - (DataType::Int8, Value::Integer(i)) if *i >= i8::MIN as i64 && *i <= i8::MAX as i64 => { - downcast_builder!(builder, Int8Builder).append_value(*i as i8); - } - (DataType::Int16, Value::Integer(i)) if *i >= i16::MIN as i64 && *i <= i16::MAX as i64 => { - downcast_builder!(builder, Int16Builder).append_value(*i as i16); - } - (DataType::Int32, Value::Integer(i)) if *i >= i32::MIN as i64 && *i <= i32::MAX as i64 => { - downcast_builder!(builder, Int32Builder).append_value(*i as i32); - } - (DataType::Int64, Value::Integer(i)) => { - downcast_builder!(builder, Int64Builder).append_value(*i); - } - (DataType::UInt8, Value::Integer(i)) if *i >= 0 && *i <= u8::MAX as i64 => { - downcast_builder!(builder, UInt8Builder).append_value(*i as u8); - } - (DataType::UInt16, Value::Integer(i)) if *i >= 0 && *i <= u16::MAX as i64 => { - downcast_builder!(builder, UInt16Builder).append_value(*i as u16); - } - (DataType::UInt32, Value::Integer(i)) if *i >= 0 && *i <= u32::MAX as i64 => { - downcast_builder!(builder, UInt32Builder).append_value(*i as u32); - } - (DataType::UInt64, Value::Integer(i)) if *i >= 0 => { - downcast_builder!(builder, UInt64Builder).append_value(*i as u64); - } - // Float types - (DataType::Float32, Value::Float(f)) => { - downcast_builder!(builder, Float32Builder).append_value(f.into_inner() as f32); - } - (DataType::Float32, Value::Integer(i)) => { - downcast_builder!(builder, Float32Builder).append_value(*i as f32); - } - (DataType::Float64, Value::Float(f)) => { - downcast_builder!(builder, Float64Builder).append_value(f.into_inner()); - } - (DataType::Float64, Value::Integer(i)) => { - downcast_builder!(builder, Float64Builder).append_value(*i as f64); - } - // Boolean - (DataType::Boolean, Value::Boolean(b)) => { - downcast_builder!(builder, BooleanBuilder).append_value(*b); - } - // String types - (DataType::Utf8, Value::Bytes(bytes)) => match std::str::from_utf8(bytes) { - Ok(s) => downcast_builder!(builder, StringBuilder).append_value(s), - Err(_) => { - let s = String::from_utf8_lossy(bytes); - downcast_builder!(builder, StringBuilder).append_value(&s) - } - }, - // Automatic JSON serialization: Object -> String - (DataType::Utf8, Value::Object(obj)) => { - let json_str = value_to_json_string(&Value::Object(obj.clone()))?; - downcast_builder!(builder, StringBuilder).append_value(&json_str); - } - // Automatic JSON serialization: Array -> String - (DataType::Utf8, Value::Array(arr)) => { - let json_str = value_to_json_string(&Value::Array(arr.clone()))?; - downcast_builder!(builder, StringBuilder).append_value(&json_str); - } - (DataType::Binary, Value::Bytes(bytes)) => { - downcast_builder!(builder, BinaryBuilder).append_value(bytes); - } - - // Recursive types: List (Array) - (DataType::List(inner_field), Value::Array(arr)) => { - let list_builder = builder - .as_any_mut() - .downcast_mut::>>() - .ok_or_else(|| ArrowEncodingError::UnsupportedType { - field_name: field.name().clone(), - data_type: field.data_type().clone(), - })?; - - for item in arr.iter() { - append_value_to_builder(list_builder.values(), item, inner_field)?; - } - list_builder.append(true); - } - - // Recursive types: Struct (Tuple) - (DataType::Struct(fields), Value::Object(obj)) => { - let struct_builder = builder - .as_any_mut() - .downcast_mut::() - .ok_or_else(|| 
ArrowEncodingError::UnsupportedType { - field_name: field.name().clone(), - data_type: field.data_type().clone(), - })?; - - for (i, field) in fields.iter().enumerate() { - let key = format!("f{}", i); - let field_builder = &mut struct_builder.field_builders_mut()[i]; - match obj.get(key.as_str()) { - Some(val) => append_value_to_builder(field_builder.as_mut(), val, field)?, - None => append_null_to_builder(field_builder.as_mut(), field.data_type())?, - } - } - struct_builder.append(true); - } - - // Recursive types: Map (nested maps) - (DataType::Map(entries_field, _), Value::Object(obj)) => { - let map_builder = builder - .as_any_mut() - .downcast_mut::>>() - .ok_or_else(|| ArrowEncodingError::UnsupportedType { - field_name: field.name().clone(), - data_type: field.data_type().clone(), - })?; - - let DataType::Struct(entries_struct) = entries_field.data_type() else { - return Err(ArrowEncodingError::UnsupportedType { - field_name: field.name().clone(), - data_type: field.data_type().clone(), - }); - }; - - let value_field = &entries_struct[1]; - for (key, value) in obj.iter() { - map_builder.keys().append_value(key.as_ref()); - append_value_to_builder(map_builder.values(), value, value_field)?; - } - map_builder - .append(true) - .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?; - } - - // Null/missing values - _ => { - if field.is_nullable() { - append_null_to_builder(builder, field.data_type())?; - } else { - return Err(ArrowEncodingError::NullConstraint { - field_name: field.name().clone(), - }); - } - } - } - Ok(()) -} - -/// Builds a List array from events for a given field. -/// Handles all nested types (including List) through recursive builder utilities. -pub(crate) fn build_list_array( - events: &[Event], - field_name: &str, - inner_field: &Field, - nullable: bool, -) -> Result { - let inner_builder = create_array_builder_for_type( - inner_field.data_type(), - events.len() * 4, // Estimate capacity - )?; - - let mut list_builder = ListBuilder::new(inner_builder); - - for event in events { - if let Event::Log(log) = event { - match log.get(field_name) { - Some(Value::Array(arr)) => { - // Recursively append values (handles primitives, structs, maps, nested lists, etc.) - for value in arr.iter() { - append_value_to_builder(list_builder.values(), value, inner_field)?; - } - list_builder.append(true); - } - _ => { - if !nullable { - return Err(ArrowEncodingError::NullConstraint { - field_name: field_name.into(), - }); - } - list_builder.append_null(); - } - } - } - } - - Ok(Arc::new(list_builder.finish())) -} - -/// Builds a Struct array from events for a given field (used for Tuples). -pub(crate) fn build_struct_array( - events: &[Event], - field_name: &str, - fields: &Fields, - nullable: bool, -) -> Result { - // Create builders for each field - let field_builders: Vec> = fields - .iter() - .map(|f| create_array_builder_for_type(f.data_type(), events.len())) - .collect::, _>>()?; - - let mut struct_builder = StructBuilder::new(fields.clone(), field_builders); - - for event in events { - if let Event::Log(log) = event { - match log.get(field_name) { - Some(Value::Object(obj)) => { - // Tuples are represented as objects with f0, f1, f2... 
keys - let field_builders = struct_builder.field_builders_mut(); - for (i, (field, builder)) in - fields.iter().zip(field_builders.iter_mut()).enumerate() - { - let key = format!("f{}", i); - if let Some(value) = obj.get(key.as_str()) { - append_value_to_builder(builder.as_mut(), value, field)?; - } else { - // If the struct field is non-nullable and the value is missing, error - if !field.is_nullable() { - return Err(ArrowEncodingError::NullConstraint { - field_name: format!("{}.{}", field_name, field.name()), - }); - } - append_null_to_builder(builder.as_mut(), field.data_type())?; - } - } - struct_builder.append(true); - } - _ => { - if !nullable { - return Err(ArrowEncodingError::NullConstraint { - field_name: field_name.into(), - }); - } - // Append nulls to all field builders - let field_builders = struct_builder.field_builders_mut(); - for (field, builder) in fields.iter().zip(field_builders.iter_mut()) { - append_null_to_builder(builder.as_mut(), field.data_type())?; - } - struct_builder.append(false); - } - } - } - } - - Ok(Arc::new(struct_builder.finish())) -} - -/// Builds a Map array from events for a given field. -pub(crate) fn build_map_array( - events: &[Event], - field_name: &str, - entries_field: &Field, - nullable: bool, -) -> Result { - // Extract key and value fields from entries struct - let entries_struct = match entries_field.data_type() { - DataType::Struct(fields) => fields, - _ => { - return Err(ArrowEncodingError::UnsupportedType { - field_name: field_name.into(), - data_type: entries_field.data_type().clone(), - }); - } - }; - - if entries_struct.len() != 2 { - return Err(ArrowEncodingError::UnsupportedType { - field_name: field_name.into(), - data_type: entries_field.data_type().clone(), - }); - } - - let value_field = &entries_struct[1]; - - // Create builders for keys and values - let key_builder = StringBuilder::with_capacity(events.len() * 4, 0); - let value_builder = create_array_builder_for_type(value_field.data_type(), events.len() * 4)?; - - let mut map_builder = MapBuilder::new(None, key_builder, value_builder); - - for event in events { - if let Event::Log(log) = event { - match log.get(field_name) { - Some(Value::Object(obj)) => { - // Append each key-value pair - for (key, value) in obj.iter() { - map_builder.keys().append_value(key.as_ref()); - append_value_to_builder(map_builder.values(), value, value_field)?; - } - map_builder - .append(true) - .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?; - } - _ => { - if !nullable { - return Err(ArrowEncodingError::NullConstraint { - field_name: field_name.into(), - }); - } - // For null maps, we need to call append(false) - map_builder - .append(false) - .map_err(|e| ArrowEncodingError::RecordBatchCreation { source: e })?; - } - } - } - } - - Ok(Arc::new(map_builder.finish())) -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow::array::{ - Array, Int32Array, Int64Array, ListArray, MapArray, StringArray, StructArray, - }; - use arrow::datatypes::{DataType, Field, Fields}; - use vector_core::event::{Event, LogEvent, Value}; - use vrl::value::ObjectMap; - - #[test] - fn test_build_list_array_with_primitives() { - let mut log1 = LogEvent::default(); - log1.insert( - "numbers", - Value::Array(vec![ - Value::Integer(1), - Value::Integer(2), - Value::Integer(3), - ]), - ); - - let mut log2 = LogEvent::default(); - log2.insert( - "numbers", - Value::Array(vec![Value::Integer(4), Value::Integer(5)]), - ); - - let events = vec![Event::Log(log1), Event::Log(log2)]; - - let inner_field = 
Field::new("item", DataType::Int64, true); - let result = build_list_array(&events, "numbers", &inner_field, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let list_array = array.as_any().downcast_ref::().unwrap(); - - assert_eq!(list_array.len(), 2); - assert!(!list_array.is_null(0)); - assert!(!list_array.is_null(1)); - - // Check first list [1, 2, 3] - let first_list = list_array.value(0); - let int_array = first_list.as_any().downcast_ref::().unwrap(); - assert_eq!(int_array.len(), 3); - assert_eq!(int_array.value(0), 1); - assert_eq!(int_array.value(1), 2); - assert_eq!(int_array.value(2), 3); - - // Check second list [4, 5] - let second_list = list_array.value(1); - let int_array = second_list.as_any().downcast_ref::().unwrap(); - assert_eq!(int_array.len(), 2); - assert_eq!(int_array.value(0), 4); - assert_eq!(int_array.value(1), 5); - } - - #[test] - fn test_build_list_array_with_nulls() { - let mut log1 = LogEvent::default(); - log1.insert("numbers", Value::Array(vec![Value::Integer(1)])); - - let log2 = LogEvent::default(); // Missing field - - let mut log3 = LogEvent::default(); - log3.insert("numbers", Value::Array(vec![Value::Integer(3)])); - - let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - - let inner_field = Field::new("item", DataType::Int64, true); - let result = build_list_array(&events, "numbers", &inner_field, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let list_array = array.as_any().downcast_ref::().unwrap(); - - assert_eq!(list_array.len(), 3); - assert!(!list_array.is_null(0)); - assert!(list_array.is_null(1)); // Missing field - assert!(!list_array.is_null(2)); - } - - #[test] - fn test_build_struct_array_with_missing_fields() { - let mut tuple = ObjectMap::new(); - tuple.insert("f0".into(), Value::Bytes("partial".into())); - // f1 is missing - - let mut log = LogEvent::default(); - log.insert("tuple", Value::Object(tuple)); - - let events = vec![Event::Log(log)]; - - let fields = Fields::from(vec![ - Field::new("f0", DataType::Utf8, true), - Field::new("f1", DataType::Int64, true), // Nullable - ]); - - let result = build_struct_array(&events, "tuple", &fields, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let struct_array = array.as_any().downcast_ref::().unwrap(); - - assert_eq!(struct_array.len(), 1); - - // f0 should have value - let f0_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f0_array.value(0), "partial"); - - // f1 should be null - let f1_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(f1_array.is_null(0)); - } - - #[test] - fn test_build_map_array_with_null() { - let mut map1 = ObjectMap::new(); - map1.insert("key1".into(), Value::Integer(100)); - - let mut log1 = LogEvent::default(); - log1.insert("map", Value::Object(map1)); - - let log2 = LogEvent::default(); // Missing map field - - let events = vec![Event::Log(log1), Event::Log(log2)]; - - let entries_field = Field::new( - "entries", - DataType::Struct(Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Int64, true), - ])), - false, - ); - - let result = build_map_array(&events, "map", &entries_field, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let map_array = array.as_any().downcast_ref::().unwrap(); - - assert_eq!(map_array.len(), 2); - assert!(!map_array.is_null(0)); - assert!(map_array.is_null(1)); - } - - #[test] - fn 
test_build_map_array_empty_map() { - let mut log = LogEvent::default(); - log.insert("map", Value::Object(ObjectMap::new())); // Empty map - - let events = vec![Event::Log(log)]; - - let entries_field = Field::new( - "entries", - DataType::Struct(Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Int64, true), - ])), - false, - ); - - let result = build_map_array(&events, "map", &entries_field, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let map_array = array.as_any().downcast_ref::().unwrap(); - - assert_eq!(map_array.len(), 1); - assert!(!map_array.is_null(0)); - - let map_value = map_array.value(0); - assert_eq!(map_value.len(), 0); // Empty but not null - } - - #[test] - fn test_json_serialization_object_to_string() { - let mut obj = ObjectMap::new(); - obj.insert("name".into(), Value::Bytes("test".into())); - obj.insert("count".into(), Value::Integer(42)); - - let mut log = LogEvent::default(); - log.insert("data", Value::Array(vec![Value::Object(obj)])); - - let events = vec![Event::Log(log)]; - - // Schema expects List - let inner_field = Field::new("item", DataType::Utf8, true); - let result = build_list_array(&events, "data", &inner_field, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let list_array = array.as_any().downcast_ref::().unwrap(); - - let values = list_array.value(0); - let string_array = values.as_any().downcast_ref::().unwrap(); - let json_str = string_array.value(0); - - // Should be JSON serialized - assert!(json_str.contains("\"name\"")); - assert!(json_str.contains("test")); - assert!(json_str.contains("\"count\"")); - assert!(json_str.contains("42")); - } - - #[test] - fn test_json_serialization_array_to_string() { - let mut log = LogEvent::default(); - log.insert( - "data", - Value::Array(vec![Value::Array(vec![ - Value::Integer(1), - Value::Integer(2), - Value::Integer(3), - ])]), - ); - - let events = vec![Event::Log(log)]; - - // Schema expects List - let inner_field = Field::new("item", DataType::Utf8, true); - let result = build_list_array(&events, "data", &inner_field, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let list_array = array.as_any().downcast_ref::().unwrap(); - - let values = list_array.value(0); - let string_array = values.as_any().downcast_ref::().unwrap(); - let json_str = string_array.value(0); - - // Should be JSON serialized array - assert_eq!(json_str, "[1,2,3]"); - } - - #[test] - fn test_nested_list_of_structs() { - let mut tuple1 = ObjectMap::new(); - tuple1.insert("f0".into(), Value::Integer(1)); - tuple1.insert("f1".into(), Value::Bytes("a".into())); - - let mut tuple2 = ObjectMap::new(); - tuple2.insert("f0".into(), Value::Integer(2)); - tuple2.insert("f1".into(), Value::Bytes("b".into())); - - let mut log = LogEvent::default(); - log.insert( - "data", - Value::Array(vec![Value::Object(tuple1), Value::Object(tuple2)]), - ); - - let events = vec![Event::Log(log)]; - - let struct_fields = Fields::from(vec![ - Field::new("f0", DataType::Int32, true), - Field::new("f1", DataType::Utf8, true), - ]); - - let inner_field = Field::new("item", DataType::Struct(struct_fields), true); - let result = build_list_array(&events, "data", &inner_field, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let list_array = array.as_any().downcast_ref::().unwrap(); - - let values = list_array.value(0); - let struct_array = values.as_any().downcast_ref::().unwrap(); - - assert_eq!(struct_array.len(), 2); - - let 
f0_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f0_array.value(0), 1); - assert_eq!(f0_array.value(1), 2); - - let f1_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f1_array.value(0), "a"); - assert_eq!(f1_array.value(1), "b"); - } - - #[test] - fn test_nested_struct_with_list() { - let mut tuple = ObjectMap::new(); - tuple.insert("f0".into(), Value::Bytes("name".into())); - tuple.insert( - "f1".into(), - Value::Array(vec![ - Value::Integer(1), - Value::Integer(2), - Value::Integer(3), - ]), - ); - - let mut log = LogEvent::default(); - log.insert("data", Value::Object(tuple)); - - let events = vec![Event::Log(log)]; - - let fields = Fields::from(vec![ - Field::new("f0", DataType::Utf8, true), - Field::new( - "f1", - DataType::List(Field::new("item", DataType::Int64, true).into()), - true, - ), - ]); - - let result = build_struct_array(&events, "data", &fields, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let struct_array = array.as_any().downcast_ref::().unwrap(); - - // Check f0 (string) - let f0_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f0_array.value(0), "name"); - - // Check f1 (list) - let f1_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - let list_values = f1_array.value(0); - let int_array = list_values.as_any().downcast_ref::().unwrap(); - assert_eq!(int_array.len(), 3); - assert_eq!(int_array.value(0), 1); - assert_eq!(int_array.value(1), 2); - assert_eq!(int_array.value(2), 3); - } - - #[test] - fn test_nested_map_with_struct_values() { - let mut struct_value = ObjectMap::new(); - struct_value.insert("f0".into(), Value::Integer(42)); - struct_value.insert("f1".into(), Value::Bytes("test".into())); - - let mut map = ObjectMap::new(); - map.insert("key1".into(), Value::Object(struct_value)); - - let mut log = LogEvent::default(); - log.insert("data", Value::Object(map)); - - let events = vec![Event::Log(log)]; - - let struct_fields = Fields::from(vec![ - Field::new("f0", DataType::Int64, true), - Field::new("f1", DataType::Utf8, true), - ]); - - let entries_field = Field::new( - "entries", - DataType::Struct(Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Struct(struct_fields), true), - ])), - false, - ); - - let result = build_map_array(&events, "data", &entries_field, true); - - assert!(result.is_ok()); - let array = result.unwrap(); - let map_array = array.as_any().downcast_ref::().unwrap(); - - assert_eq!(map_array.len(), 1); - assert!(!map_array.is_null(0)); - - let map_value = map_array.value(0); - assert_eq!(map_value.len(), 1); - - // Verify struct values - let struct_array = map_array - .values() - .as_any() - .downcast_ref::() - .unwrap(); - let f0_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f0_array.value(0), 42); - - let f1_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f1_array.value(0), "test"); - } -} diff --git a/lib/codecs/src/encoding/format/arrow/types/decimal.rs b/lib/codecs/src/encoding/format/arrow/types/decimal.rs deleted file mode 100644 index a39fd32d12dc7..0000000000000 --- a/lib/codecs/src/encoding/format/arrow/types/decimal.rs +++ /dev/null @@ -1,116 +0,0 @@ -use arrow::{ - array::{ArrayRef, Decimal128Builder, Decimal256Builder}, - datatypes::{DataType, i256}, -}; -use rust_decimal::Decimal; -use std::sync::Arc; -use 
vector_core::event::{Event, Value}; - -use crate::encoding::format::arrow::ArrowEncodingError; - -/// Macro to handle appending null or returning an error for non-nullable fields. -macro_rules! handle_null_constraints { - ($builder:expr, $nullable:expr, $field_name:expr) => {{ - if !$nullable { - return Err(ArrowEncodingError::NullConstraint { - field_name: $field_name.into(), - }); - } - $builder.append_null(); - }}; -} - -pub(crate) fn build_decimal128_array( - events: &[Event], - field_name: &str, - precision: u8, - scale: i8, - nullable: bool, -) -> Result { - let mut builder = Decimal128Builder::with_capacity(events.len()) - .with_precision_and_scale(precision, scale) - .map_err(|_| ArrowEncodingError::UnsupportedType { - field_name: field_name.into(), - data_type: DataType::Decimal128(precision, scale), - })?; - - let target_scale = scale.unsigned_abs() as u32; - - for event in events { - if let Event::Log(log) = event { - let mut appended = false; - match log.get(field_name) { - Some(Value::Float(f)) => { - if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) { - decimal.rescale(target_scale); - let mantissa = decimal.mantissa(); - builder.append_value(mantissa); - appended = true; - } - } - Some(Value::Integer(i)) => { - let mut decimal = Decimal::from(*i); - decimal.rescale(target_scale); - let mantissa = decimal.mantissa(); - builder.append_value(mantissa); - appended = true; - } - _ => {} - } - - if !appended { - handle_null_constraints!(builder, nullable, field_name); - } - } - } - - Ok(Arc::new(builder.finish())) -} - -pub(crate) fn build_decimal256_array( - events: &[Event], - field_name: &str, - precision: u8, - scale: i8, - nullable: bool, -) -> Result { - let mut builder = Decimal256Builder::with_capacity(events.len()) - .with_precision_and_scale(precision, scale) - .map_err(|_| ArrowEncodingError::UnsupportedType { - field_name: field_name.into(), - data_type: DataType::Decimal256(precision, scale), - })?; - - let target_scale = scale.unsigned_abs() as u32; - - for event in events { - if let Event::Log(log) = event { - let mut appended = false; - match log.get(field_name) { - Some(Value::Float(f)) => { - if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) { - decimal.rescale(target_scale); - let mantissa = decimal.mantissa(); - // rust_decimal does not support i256 natively so we upcast here - builder.append_value(i256::from_i128(mantissa)); - appended = true; - } - } - Some(Value::Integer(i)) => { - let mut decimal = Decimal::from(*i); - decimal.rescale(target_scale); - let mantissa = decimal.mantissa(); - builder.append_value(i256::from_i128(mantissa)); - appended = true; - } - _ => {} - } - - if !appended { - handle_null_constraints!(builder, nullable, field_name); - } - } - } - - Ok(Arc::new(builder.finish())) -} diff --git a/lib/codecs/src/encoding/format/arrow/types/primitives.rs b/lib/codecs/src/encoding/format/arrow/types/primitives.rs deleted file mode 100644 index 8978c014d1855..0000000000000 --- a/lib/codecs/src/encoding/format/arrow/types/primitives.rs +++ /dev/null @@ -1,187 +0,0 @@ -use arrow::array::{ - ArrayRef, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder, Int8Builder, - Int16Builder, Int32Builder, Int64Builder, StringBuilder, UInt8Builder, UInt16Builder, - UInt32Builder, UInt64Builder, -}; -use std::sync::Arc; -use vector_core::event::{Event, Value}; - -use crate::encoding::format::arrow::ArrowEncodingError; - -/// Macro to handle appending null or returning an error for non-nullable fields. -macro_rules! 
handle_null_constraints { - ($builder:expr, $nullable:expr, $field_name:expr) => {{ - if !$nullable { - return Err(ArrowEncodingError::NullConstraint { - field_name: $field_name.into(), - }); - } - $builder.append_null(); - }}; -} - -/// Macro to generate a `build_*_array` function for primitive types. -macro_rules! define_build_primitive_array_fn { - ( - $fn_name:ident, // The function name (e.g., build_int8_array) - $builder_ty:ty, // The builder type (e.g., Int8Builder) - // One or more match arms for valid Value types - $( $value_pat:pat $(if $guard:expr)? => $append_expr:expr ),+ - ) => { - pub(crate) fn $fn_name( - events: &[Event], - field_name: &str, - nullable: bool, - ) -> Result { - let mut builder = <$builder_ty>::with_capacity(events.len()); - - for event in events { - if let Event::Log(log) = event { - match log.get(field_name) { - $( - $value_pat $(if $guard)? => builder.append_value($append_expr), - )+ - // All other patterns are treated as null/invalid - _ => handle_null_constraints!(builder, nullable, field_name), - } - } - } - Ok(Arc::new(builder.finish())) - } - }; -} - -pub(crate) fn build_string_array( - events: &[Event], - field_name: &str, - nullable: bool, -) -> Result { - let mut builder = StringBuilder::with_capacity(events.len(), 0); - - for event in events { - if let Event::Log(log) = event { - let mut appended = false; - if let Some(value) = log.get(field_name) { - match value { - Value::Bytes(bytes) => { - // Attempt direct UTF-8 conversion first, fallback to lossy - match std::str::from_utf8(bytes) { - Ok(s) => builder.append_value(s), - Err(_) => builder.append_value(&String::from_utf8_lossy(bytes)), - } - appended = true; - } - Value::Object(obj) => { - if let Ok(s) = serde_json::to_string(&obj) { - builder.append_value(s); - appended = true; - } - } - Value::Array(arr) => { - if let Ok(s) = serde_json::to_string(&arr) { - builder.append_value(s); - appended = true; - } - } - _ => { - builder.append_value(&value.to_string_lossy()); - appended = true; - } - } - } - - if !appended { - handle_null_constraints!(builder, nullable, field_name); - } - } - } - - Ok(Arc::new(builder.finish())) -} - -define_build_primitive_array_fn!( - build_int8_array, - Int8Builder, - Some(Value::Integer(i)) if *i >= i8::MIN as i64 && *i <= i8::MAX as i64 => *i as i8 -); - -define_build_primitive_array_fn!( - build_int16_array, - Int16Builder, - Some(Value::Integer(i)) if *i >= i16::MIN as i64 && *i <= i16::MAX as i64 => *i as i16 -); - -define_build_primitive_array_fn!( - build_int32_array, - Int32Builder, - Some(Value::Integer(i)) if *i >= i32::MIN as i64 && *i <= i32::MAX as i64 => *i as i32 -); - -define_build_primitive_array_fn!( - build_int64_array, - Int64Builder, - Some(Value::Integer(i)) => *i -); - -define_build_primitive_array_fn!( - build_uint8_array, - UInt8Builder, - Some(Value::Integer(i)) if *i >= 0 && *i <= u8::MAX as i64 => *i as u8 -); - -define_build_primitive_array_fn!( - build_uint16_array, - UInt16Builder, - Some(Value::Integer(i)) if *i >= 0 && *i <= u16::MAX as i64 => *i as u16 -); - -define_build_primitive_array_fn!( - build_uint32_array, - UInt32Builder, - Some(Value::Integer(i)) if *i >= 0 && *i <= u32::MAX as i64 => *i as u32 -); - -define_build_primitive_array_fn!( - build_uint64_array, - UInt64Builder, - Some(Value::Integer(i)) if *i >= 0 => *i as u64 -); - -define_build_primitive_array_fn!( - build_float32_array, - Float32Builder, - Some(Value::Float(f)) => f.into_inner() as f32, - Some(Value::Integer(i)) => *i as f32 -); - 
-define_build_primitive_array_fn!( - build_float64_array, - Float64Builder, - Some(Value::Float(f)) => f.into_inner(), - Some(Value::Integer(i)) => *i as f64 -); - -define_build_primitive_array_fn!( - build_boolean_array, - BooleanBuilder, - Some(Value::Boolean(b)) => *b -); - -pub(crate) fn build_binary_array( - events: &[Event], - field_name: &str, - nullable: bool, -) -> Result { - let mut builder = BinaryBuilder::with_capacity(events.len(), 0); - - for event in events { - if let Event::Log(log) = event { - match log.get(field_name) { - Some(Value::Bytes(bytes)) => builder.append_value(bytes), - _ => handle_null_constraints!(builder, nullable, field_name), - } - } - } - - Ok(Arc::new(builder.finish())) -} diff --git a/lib/codecs/src/encoding/format/arrow/types/temporal.rs b/lib/codecs/src/encoding/format/arrow/types/temporal.rs deleted file mode 100644 index 72e5578dd27cb..0000000000000 --- a/lib/codecs/src/encoding/format/arrow/types/temporal.rs +++ /dev/null @@ -1,85 +0,0 @@ -use arrow::{ - array::{ - ArrayRef, TimestampMicrosecondBuilder, TimestampMillisecondBuilder, - TimestampNanosecondBuilder, TimestampSecondBuilder, - }, - datatypes::TimeUnit, -}; -use chrono::{DateTime, Utc}; -use std::sync::Arc; -use vector_core::event::{Event, Value}; - -use crate::encoding::format::arrow::ArrowEncodingError; - -pub(crate) fn extract_timestamp(value: &Value) -> Option> { - match value { - Value::Timestamp(ts) => Some(*ts), - Value::Bytes(bytes) => std::str::from_utf8(bytes) - .ok() - .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok()) - .map(|dt| dt.with_timezone(&Utc)), - _ => None, - } -} - -pub(crate) fn build_timestamp_array( - events: &[Event], - field_name: &str, - time_unit: TimeUnit, - nullable: bool, -) -> Result { - macro_rules! build_array { - ($builder:ty, $converter:expr) => {{ - let mut builder = <$builder>::with_capacity(events.len()); - for event in events { - if let Event::Log(log) = event { - let value_to_append = log.get(field_name).and_then(|value| { - // First, try to extract it as a native or string timestamp - if let Some(ts) = extract_timestamp(value) { - $converter(&ts) - } - // Else, fall back to a raw integer - else if let Value::Integer(i) = value { - Some(*i) - } - // Else, it's an unsupported type (e.g., Bool, Float) - else { - None - } - }); - - if value_to_append.is_none() && !nullable { - return Err(ArrowEncodingError::NullConstraint { - field_name: field_name.into(), - }); - } - - builder.append_option(value_to_append); - } - } - Ok(Arc::new(builder.finish())) - }}; - } - - match time_unit { - TimeUnit::Second => { - build_array!(TimestampSecondBuilder, |ts: &DateTime| Some( - ts.timestamp() - )) - } - TimeUnit::Millisecond => { - build_array!(TimestampMillisecondBuilder, |ts: &DateTime| Some( - ts.timestamp_millis() - )) - } - TimeUnit::Microsecond => { - build_array!(TimestampMicrosecondBuilder, |ts: &DateTime| Some( - ts.timestamp_micros() - )) - } - TimeUnit::Nanosecond => { - build_array!(TimestampNanosecondBuilder, |ts: &DateTime| ts - .timestamp_nanos_opt()) - } - } -} From 0eaead563f4299ad1484ca01775993cd5bdd0d1f Mon Sep 17 00:00:00 2001 From: benjamin-awd Date: Wed, 24 Dec 2025 13:12:42 +0800 Subject: [PATCH 05/11] refactor: use append_option where possible --- .../src/encoding/format/arrow/builder.rs | 143 +++++++----------- 1 file changed, 54 insertions(+), 89 deletions(-) diff --git a/lib/codecs/src/encoding/format/arrow/builder.rs b/lib/codecs/src/encoding/format/arrow/builder.rs index 157fbb6b5745c..583aa9701aed6 100644 --- 
a/lib/codecs/src/encoding/format/arrow/builder.rs
+++ b/lib/codecs/src/encoding/format/arrow/builder.rs
@@ -79,22 +79,6 @@ macro_rules! append_null_match {
     };
 }
 
-/// Macro to simplify integer/float appending with bounds checking and casting.
-macro_rules! append_primitive {
-    // Simple case: no bounds checking
-    ($builder:expr, $builder_type:ty, $val:expr, $cast_type:ty) => {{
-        downcast_builder!($builder, $builder_type).append_value(*$val as $cast_type);
-    }};
-    // With bounds checking
-    ($builder:expr, $builder_type:ty, $val:expr, $cast_type:ty, $min:expr, $max:expr) => {{
-        if *$val >= $min as i64 && *$val <= $max as i64 {
-            downcast_builder!($builder, $builder_type).append_value(*$val as $cast_type);
-        } else {
-            downcast_builder!($builder, $builder_type).append_null();
-        }
-    }};
-}
-
 /// Helper function to serialize a Value to JSON string.
 /// This is used when the schema expects a string but the data contains complex types.
 fn value_to_json_string(value: &Value) -> Result<String, ArrowEncodingError> {
@@ -151,48 +135,51 @@ fn append_value_to_builder(
     match (field.data_type(), value) {
         // Integer types with range checking
         (DataType::Int8, Value::Integer(i)) => {
-            append_primitive!(builder, Int8Builder, i, i8, i8::MIN, i8::MAX)
+            let val = (*i >= i8::MIN as i64 && *i <= i8::MAX as i64).then_some(*i as i8);
+            downcast_builder!(builder, Int8Builder).append_option(val);
         }
         (DataType::Int16, Value::Integer(i)) => {
-            append_primitive!(builder, Int16Builder, i, i16, i16::MIN, i16::MAX)
+            let val = (*i >= i16::MIN as i64 && *i <= i16::MAX as i64).then_some(*i as i16);
+            downcast_builder!(builder, Int16Builder).append_option(val);
         }
         (DataType::Int32, Value::Integer(i)) => {
-            append_primitive!(builder, Int32Builder, i, i32, i32::MIN, i32::MAX)
+            let val = (*i >= i32::MIN as i64 && *i <= i32::MAX as i64).then_some(*i as i32);
+            downcast_builder!(builder, Int32Builder).append_option(val);
+        }
+        (DataType::Int64, Value::Integer(i)) => {
+            downcast_builder!(builder, Int64Builder).append_value(*i);
         }
-        (DataType::Int64, Value::Integer(i)) => append_primitive!(builder, Int64Builder, i, i64),
 
         // Unsigned integer types with range checking
         (DataType::UInt8, Value::Integer(i)) => {
-            append_primitive!(builder, UInt8Builder, i, u8, 0, u8::MAX)
+            let val = (*i >= 0 && *i <= u8::MAX as i64).then_some(*i as u8);
+            downcast_builder!(builder, UInt8Builder).append_option(val);
         }
         (DataType::UInt16, Value::Integer(i)) => {
-            append_primitive!(builder, UInt16Builder, i, u16, 0, u16::MAX)
+            let val = (*i >= 0 && *i <= u16::MAX as i64).then_some(*i as u16);
+            downcast_builder!(builder, UInt16Builder).append_option(val);
        }
         (DataType::UInt32, Value::Integer(i)) => {
-            append_primitive!(builder, UInt32Builder, i, u32, 0, u32::MAX)
+            let val = (*i >= 0 && *i <= u32::MAX as i64).then_some(*i as u32);
+            downcast_builder!(builder, UInt32Builder).append_option(val);
         }
         (DataType::UInt64, Value::Integer(i)) => {
-            if *i >= 0 {
-                append_primitive!(builder, UInt64Builder, i, u64);
-            } else {
-                downcast_builder!(builder, UInt64Builder).append_null();
-            }
+            let val = (*i >= 0).then_some(*i as u64);
+            downcast_builder!(builder, UInt64Builder).append_option(val);
         }
 
         // Float types
         (DataType::Float32, Value::Float(f)) => {
-            let val = f.into_inner();
-            downcast_builder!(builder, Float32Builder).append_value(val as f32);
+            downcast_builder!(builder, Float32Builder).append_value(f.into_inner() as f32);
         }
         (DataType::Float32, Value::Integer(i)) => {
-            append_primitive!(builder, Float32Builder, i, f32)
+            downcast_builder!(builder, Float32Builder).append_value(*i as f32);
         }
         (DataType::Float64, Value::Float(f)) => {
-            let val = f.into_inner();
-            downcast_builder!(builder, Float64Builder).append_value(val);
+            downcast_builder!(builder, Float64Builder).append_value(f.into_inner());
         }
         (DataType::Float64, Value::Integer(i)) => {
-            append_primitive!(builder, Float64Builder, i, f64)
+            downcast_builder!(builder, Float64Builder).append_value(*i as f64);
         }
 
         // Boolean
@@ -249,30 +236,22 @@ fn append_value_to_builder(
                 }
             };
 
-            match (time_unit, converted_value) {
-                (TimeUnit::Second, Some(val)) => {
-                    downcast_builder!(builder, TimestampSecondBuilder).append_value(val);
-                }
-                (TimeUnit::Millisecond, Some(val)) => {
-                    downcast_builder!(builder, TimestampMillisecondBuilder).append_value(val);
+            match time_unit {
+                TimeUnit::Second => {
+                    downcast_builder!(builder, TimestampSecondBuilder)
+                        .append_option(converted_value);
                 }
-                (TimeUnit::Microsecond, Some(val)) => {
-                    downcast_builder!(builder, TimestampMicrosecondBuilder).append_value(val);
+                TimeUnit::Millisecond => {
+                    downcast_builder!(builder, TimestampMillisecondBuilder)
+                        .append_option(converted_value);
                 }
-                (TimeUnit::Nanosecond, Some(val)) => {
-                    downcast_builder!(builder, TimestampNanosecondBuilder).append_value(val);
+                TimeUnit::Microsecond => {
+                    downcast_builder!(builder, TimestampMicrosecondBuilder)
+                        .append_option(converted_value);
                 }
-                (TimeUnit::Second, None) => {
-                    downcast_builder!(builder, TimestampSecondBuilder).append_null();
-                }
-                (TimeUnit::Millisecond, None) => {
-                    downcast_builder!(builder, TimestampMillisecondBuilder).append_null();
-                }
-                (TimeUnit::Microsecond, None) => {
-                    downcast_builder!(builder, TimestampMicrosecondBuilder).append_null();
-                }
-                (TimeUnit::Nanosecond, None) => {
-                    downcast_builder!(builder, TimestampNanosecondBuilder).append_null();
+                TimeUnit::Nanosecond => {
+                    downcast_builder!(builder, TimestampNanosecondBuilder)
+                        .append_option(converted_value);
                 }
             }
         }
@@ -281,57 +260,43 @@ fn append_value_to_builder(
         (DataType::Decimal128(_precision, scale), value) => {
             use rust_decimal::Decimal;
 
-            let decimal_builder = builder
-                .as_any_mut()
-                .downcast_mut::<Decimal128Builder>()
-                .expect("Failed to downcast to Decimal128Builder");
-
             let target_scale = scale.unsigned_abs() as u32;
 
-            match value {
-                Value::Float(f) => {
-                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
-                        decimal.rescale(target_scale);
-                        decimal_builder.append_value(decimal.mantissa());
-                    } else {
-                        decimal_builder.append_null();
-                    }
-                }
+            let mantissa = match value {
+                Value::Float(f) => Decimal::try_from(f.into_inner()).ok().map(|mut d| {
+                    d.rescale(target_scale);
+                    d.mantissa()
+                }),
                 Value::Integer(i) => {
                     let mut decimal = Decimal::from(*i);
                     decimal.rescale(target_scale);
-                    decimal_builder.append_value(decimal.mantissa());
+                    Some(decimal.mantissa())
                 }
-                _ => decimal_builder.append_null(),
-            }
+                _ => None,
+            };
+
+            downcast_builder!(builder, Decimal128Builder).append_option(mantissa);
         }
 
         (DataType::Decimal256(_precision, scale), value) => {
             use rust_decimal::Decimal;
 
-            let decimal_builder = builder
-                .as_any_mut()
-                .downcast_mut::<Decimal256Builder>()
-                .expect("Failed to downcast to Decimal256Builder");
-
             let target_scale = scale.unsigned_abs() as u32;
 
-            match value {
-                Value::Float(f) => {
-                    if let Ok(mut decimal) = Decimal::try_from(f.into_inner()) {
-                        decimal.rescale(target_scale);
-                        decimal_builder.append_value(i256::from_i128(decimal.mantissa()));
-                    } else {
-                        decimal_builder.append_null();
-                    }
-                }
+            let mantissa = match value {
+                Value::Float(f) => Decimal::try_from(f.into_inner()).ok().map(|mut d| {
+                    d.rescale(target_scale);
+                    i256::from_i128(d.mantissa())
+                }),
                 Value::Integer(i) => {
                     let mut decimal = Decimal::from(*i);
                     decimal.rescale(target_scale);
-                    decimal_builder.append_value(i256::from_i128(decimal.mantissa()));
+                    Some(i256::from_i128(decimal.mantissa()))
                 }
-                _ => decimal_builder.append_null(),
-            }
+                _ => None,
+            };
+
+            downcast_builder!(builder, Decimal256Builder).append_option(mantissa);
         }
 
         // Complex types

From a7a53a65c0e9756562c1a17cfc168ff42e660f21 Mon Sep 17 00:00:00 2001
From: benjamin-awd
Date: Wed, 24 Dec 2025 13:18:46 +0800
Subject: [PATCH 06/11] refactor: use iterator method for building record batch

---
 lib/codecs/src/encoding/format/arrow/builder.rs | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/lib/codecs/src/encoding/format/arrow/builder.rs b/lib/codecs/src/encoding/format/arrow/builder.rs
index 583aa9701aed6..262cb89f437ee 100644
--- a/lib/codecs/src/encoding/format/arrow/builder.rs
+++ b/lib/codecs/src/encoding/format/arrow/builder.rs
@@ -393,13 +393,11 @@ pub(crate) fn build_record_batch(
     schema: SchemaRef,
     events: &[Event],
 ) -> Result<RecordBatch, ArrowEncodingError> {
-    let num_fields = schema.fields().len();
-    let mut columns: Vec<ArrayRef> = Vec::with_capacity(num_fields);
-
-    for field in schema.fields() {
-        let array = build_array_for_field(events, field)?;
-        columns.push(array);
-    }
+    let columns: Vec<ArrayRef> = schema
+        .fields()
+        .iter()
+        .map(|field| build_array_for_field(events, field))
+        .collect::<Result<_, _>>()?;
 
     RecordBatch::try_new(schema, columns)
         .map_err(|source| ArrowEncodingError::RecordBatchCreation { source })

From 7df285064a17eebb30bab168bec6a967942c7fe2 Mon Sep 17 00:00:00 2001
From: benjamin-awd
Date: Wed, 24 Dec 2025 15:02:07 +0800
Subject: [PATCH 07/11] chore: remove duplicated code in arrow tests

---
 lib/codecs/src/encoding/format/arrow/tests.rs | 3394 ++++++++---------
 1 file changed, 1560 insertions(+), 1834 deletions(-)

diff --git a/lib/codecs/src/encoding/format/arrow/tests.rs b/lib/codecs/src/encoding/format/arrow/tests.rs
index a3374e968a50c..d3277219cc0c1 100644
--- a/lib/codecs/src/encoding/format/arrow/tests.rs
+++ b/lib/codecs/src/encoding/format/arrow/tests.rs
@@ -7,2128 +7,1854 @@ use arrow::{
     },
     datatypes::{DataType, Field, Fields, Schema, SchemaRef, TimeUnit},
     ipc::reader::StreamReader,
+    record_batch::RecordBatch,
 };
 use chrono::Utc;
 use std::{io::Cursor, sync::Arc};
 use vector_core::event::{Event, LogEvent, Value};
 
-#[test]
-fn test_encode_all_types() {
-    use arrow::array::{
-        Decimal128Array, ListArray, MapArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array,
-    };
-    use vrl::value::ObjectMap;
-
-    let now = Utc::now();
-
-    // Create a struct (tuple) value
-    let mut tuple_value = ObjectMap::new();
-    tuple_value.insert("f0".into(), Value::Bytes("nested_str".into()));
-    tuple_value.insert("f1".into(), Value::Integer(999));
-
-    // Create a list value
-    let list_value = Value::Array(vec![
-        Value::Integer(1),
-        Value::Integer(2),
-        Value::Integer(3),
-    ]);
-
-    // Create a map value
-    let mut map_value = ObjectMap::new();
-    map_value.insert("key1".into(), Value::Integer(100));
-    map_value.insert("key2".into(), Value::Integer(200));
+/// Helper to encode events and return the decoded RecordBatch
+fn encode_and_decode(
+    events: Vec<Event>,
+    schema: SchemaRef,
+) -> Result<RecordBatch, Box<dyn std::error::Error>> {
+    let bytes = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)))?;
+    let cursor = Cursor::new(bytes);
+    let mut reader = StreamReader::try_new(cursor, None)?;
+    Ok(reader.next().unwrap()?)
+}
+
+/// Create a simple event from key-value pairs
+fn create_event<V>(fields: Vec<(&str, V)>) -> Event
+where
+    V: Into<Value>,
+{
+    let mut log = LogEvent::default();
+    for (key, value) in fields {
+        log.insert(key, value.into());
+    }
+    Event::Log(log)
+}
+
-#[test]
-fn test_encode_null_values() {
-    let mut log1 = LogEvent::default();
-    log1.insert("field_a", 1);
-    // field_b is missing
-
-    let mut log2 = LogEvent::default();
-    log2.insert("field_b", 2);
-    // field_a is missing
-
-    let events = vec![Event::Log(log1), Event::Log(log2)];
-
-    let schema = SchemaRef::new(Schema::new(vec![
-        Field::new("field_a", DataType::Int64, true),
-        Field::new("field_b", DataType::Int64, true),
-    ]));
-
-    let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-    assert!(result.is_ok());
-
-    let bytes = result.unwrap();
-    let cursor = Cursor::new(bytes);
-    let mut reader = StreamReader::try_new(cursor, None).unwrap();
-    let batch = reader.next().unwrap().unwrap();
-
-    assert_eq!(batch.num_rows(), 2);
-
-    let field_a = batch
-        .column(0)
-        .as_any()
-        .downcast_ref::<Int64Array>()
-        .unwrap();
-    assert_eq!(field_a.value(0), 1);
-    assert!(field_a.is_null(1));
+/// Assert a column has expected integer values (with optional nulls)
+fn assert_int64_column(batch: &RecordBatch, col_index: usize, expected: &[Option<i64>]) {
+    let array = batch
+        .column(col_index)
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .expect("Expected Int64Array");
+
+    assert_eq!(
+        array.len(),
+ expected.len(), + "Array length mismatch at column {}", + col_index ); - assert_eq!( - batch - .column(2) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - 32000 - ); - assert_eq!( - batch - .column(3) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - 1000000 - ); - assert_eq!( - batch - .column(4) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - 42 - ); - assert_eq!( - batch - .column(5) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - 255 - ); - assert_eq!( - batch - .column(6) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - 65535 - ); - assert_eq!( - batch - .column(7) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - 4000000 - ); - assert_eq!( - batch - .column(8) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - 9000000000 - ); - assert!( - (batch - .column(9) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - - 3.15) - .abs() - < 0.001 - ); - assert!( - (batch - .column(10) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - - 3.15) - .abs() - < 0.001 - ); - assert!( - batch - .column(11) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - assert_eq!( - batch - .column(12) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - b"binary" - ); - assert_eq!( - batch - .column(13) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - now.timestamp_millis() - ); - - let decimal_array: &arrow::array::PrimitiveArray = batch - .column(14) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(decimal_array.value(0), 9999); - let list_array = batch - .column(15) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!list_array.is_null(0)); - let list_value = list_array.value(0); - assert_eq!(list_value.len(), 3); - let int_array = list_value.as_any().downcast_ref::().unwrap(); - assert_eq!(int_array.value(0), 1); - assert_eq!(int_array.value(1), 2); - assert_eq!(int_array.value(2), 3); - - // Verify struct field - let struct_array = batch - .column(16) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!struct_array.is_null(0)); - let f0_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f0_array.value(0), "nested_str"); - let f1_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f1_array.value(0), 999); - - // Verify map field - let map_array = batch - .column(17) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!map_array.is_null(0)); - let map_value = map_array.value(0); - assert_eq!(map_value.len(), 2); + for (i, &expected_val) in expected.iter().enumerate() { + match expected_val { + Some(val) => { + assert!( + !array.is_null(i), + "Expected value {} at index {}, got null", + val, + i + ); + assert_eq!(array.value(i), val, "Value mismatch at index {}", i); + } + None => assert!(array.is_null(i), "Expected null at index {}, got value", i), + } + } } -#[test] -fn test_encode_null_values() { - let mut log1 = LogEvent::default(); - log1.insert("field_a", 1); - // field_b is missing +/// Create a schema with a single field +fn single_field_schema(name: &str, data_type: DataType, nullable: bool) -> SchemaRef { + SchemaRef::new(Schema::new(vec![Field::new(name, data_type, nullable)])) +} - let mut log2 = LogEvent::default(); - log2.insert("field_b", 2); - // field_a is missing +/// Assert a primitive value at a specific column and row +macro_rules! 
assert_primitive_value { + ($batch:expr, $col:expr, $row:expr, $array_type:ty, $expected:expr) => { + assert_eq!( + $batch + .column($col) + .as_any() + .downcast_ref::<$array_type>() + .unwrap() + .value($row), + $expected + ) + }; +} - let events = vec![Event::Log(log1), Event::Log(log2)]; +mod comprehensive { + use super::*; - let schema = SchemaRef::new(Schema::new(vec![ - Field::new("field_a", DataType::Int64, true), - Field::new("field_b", DataType::Int64, true), - ])); + #[test] + fn test_encode_all_types() { + use arrow::array::{ + Decimal128Array, ListArray, MapArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array, + }; + use vrl::value::ObjectMap; - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); + let now = Utc::now(); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + // Create a struct (tuple) value + let mut tuple_value = ObjectMap::new(); + tuple_value.insert("f0".into(), Value::Bytes("nested_str".into())); + tuple_value.insert("f1".into(), Value::Integer(999)); - assert_eq!(batch.num_rows(), 2); + // Create a list value + let list_value = Value::Array(vec![ + Value::Integer(1), + Value::Integer(2), + Value::Integer(3), + ]); + + // Create a map value + let mut map_value = ObjectMap::new(); + map_value.insert("key1".into(), Value::Integer(100)); + map_value.insert("key2".into(), Value::Integer(200)); + + let mut log = LogEvent::default(); + // Primitive types + log.insert("string_field", "test"); + log.insert("int8_field", 127); + log.insert("int16_field", 32000); + log.insert("int32_field", 1000000); + log.insert("int64_field", 42); + log.insert("uint8_field", 255); + log.insert("uint16_field", 65535); + log.insert("uint32_field", 4000000); + log.insert("uint64_field", 9000000000_i64); + log.insert("float32_field", 3.15); + log.insert("float64_field", 3.15); + log.insert("bool_field", true); + log.insert("bytes_field", bytes::Bytes::from("binary")); + log.insert("timestamp_field", now); + log.insert("decimal_field", 99.99); + // Complex types + log.insert("list_field", list_value); + log.insert("struct_field", Value::Object(tuple_value)); + log.insert("map_field", Value::Object(map_value)); + + let events = vec![Event::Log(log)]; + + // Build schema with all supported types + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new("f1", DataType::Int64, true), + ]); + + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int64, true), + ])), + false, + ); - let field_a = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(field_a.value(0), 1); - assert!(field_a.is_null(1)); + let schema = SchemaRef::new(Schema::new(vec![ + Field::new("string_field", DataType::Utf8, true), + Field::new("int8_field", DataType::Int8, true), + Field::new("int16_field", DataType::Int16, true), + Field::new("int32_field", DataType::Int32, true), + Field::new("int64_field", DataType::Int64, true), + Field::new("uint8_field", DataType::UInt8, true), + Field::new("uint16_field", DataType::UInt16, true), + Field::new("uint32_field", DataType::UInt32, true), + Field::new("uint64_field", DataType::UInt64, true), + Field::new("float32_field", DataType::Float32, true), + Field::new("float64_field", 
DataType::Float64, true),
+            Field::new("bool_field", DataType::Boolean, true),
+            Field::new("bytes_field", DataType::Binary, true),
+            Field::new(
+                "timestamp_field",
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                true,
+            ),
+            Field::new("decimal_field", DataType::Decimal128(10, 2), true),
+            Field::new(
+                "list_field",
+                DataType::List(Field::new("item", DataType::Int64, true).into()),
+                true,
+            ),
+            Field::new("struct_field", DataType::Struct(struct_fields), true),
+            Field::new("map_field", DataType::Map(map_entries.into(), false), true),
+        ]));
+
+        let batch = encode_and_decode(events, schema).expect("Failed to encode");
+
+        assert_eq!(batch.num_rows(), 1);
+        assert_eq!(batch.num_columns(), 18);
+
+        // Verify all primitive types
+        assert_eq!(
+            batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<StringArray>()
+                .unwrap()
+                .value(0),
+            "test"
+        );
+        assert_primitive_value!(batch, 1, 0, arrow::array::Int8Array, 127);
+        assert_primitive_value!(batch, 2, 0, arrow::array::Int16Array, 32000);
+        assert_primitive_value!(batch, 3, 0, arrow::array::Int32Array, 1000000);
+        assert_primitive_value!(batch, 4, 0, Int64Array, 42);
+        assert_primitive_value!(batch, 5, 0, UInt8Array, 255);
+        assert_primitive_value!(batch, 6, 0, UInt16Array, 65535);
+        assert_primitive_value!(batch, 7, 0, UInt32Array, 4000000);
+        assert_primitive_value!(batch, 8, 0, UInt64Array, 9000000000);
+        assert!(
+            (batch
+                .column(9)
+                .as_any()
+                .downcast_ref::<Float32Array>()
+                .unwrap()
+                .value(0)
+                - 3.15)
+                .abs()
+                < 0.001
+        );
+        assert!(
+            (batch
+                .column(10)
+                .as_any()
+                .downcast_ref::<Float64Array>()
+                .unwrap()
+                .value(0)
+                - 3.15)
+                .abs()
+                < 0.001
+        );
+        assert!(
+            batch
+                .column(11)
+                .as_any()
+                .downcast_ref::<BooleanArray>()
+                .unwrap()
+                .value(0)
+        );
+        assert_primitive_value!(batch, 12, 0, BinaryArray, b"binary");
+        assert_primitive_value!(
+            batch,
+            13,
+            0,
+            TimestampMillisecondArray,
+            now.timestamp_millis()
+        );
+        assert_primitive_value!(batch, 14, 0, Decimal128Array, 9999);

-    let field_b = batch
-        .column(1)
-        .as_any()
-        .downcast_ref::<Int64Array>()
-        .unwrap();
-    assert!(field_b.is_null(0));
-    assert_eq!(field_b.value(1), 2);
+        let list_array = batch
+            .column(15)
+            .as_any()
+            .downcast_ref::<ListArray>()
+            .unwrap();
+        assert!(!list_array.is_null(0));
+        let list_value = list_array.value(0);
+        assert_eq!(list_value.len(), 3);
+        let int_array = list_value.as_any().downcast_ref::<Int64Array>().unwrap();
+        assert_eq!(int_array.value(0), 1);
+        assert_eq!(int_array.value(1), 2);
+        assert_eq!(int_array.value(2), 3);
+
+        // Verify struct field
+        let struct_array = batch
+            .column(16)
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .unwrap();
+        assert!(!struct_array.is_null(0));
+        assert_primitive_value!(struct_array, 0, 0, StringArray, "nested_str");
+        assert_primitive_value!(struct_array, 1, 0, Int64Array, 999);
+
+        // Verify map field
+        let map_array = batch
+            .column(17)
+            .as_any()
+            .downcast_ref::<MapArray>()
+            .unwrap();
+        assert!(!map_array.is_null(0));
+        let map_value = map_array.value(0);
+        assert_eq!(map_value.len(), 2);
+    }
 }

-#[test]
-fn test_encode_type_mismatches() {
-    let mut log1 = LogEvent::default();
-    log1.insert("field", 42); // Integer
-
-    let mut log2 = LogEvent::default();
-    log2.insert("field", 3.15); // Float - type mismatch!
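+// The helpers above keep the per-type tests below compact; a minimal usage
+// sketch (hypothetical "count" field, helpers exactly as defined in this file):
+//
+//     let events = vec![create_event(vec![("count", 7_i64)])];
+//     let schema = single_field_schema("count", DataType::Int64, true);
+//     let batch = encode_and_decode(events, schema).unwrap();
+//     assert_int64_column(&batch, 0, &[Some(7)]);
+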
+mod edge_cases {
+    use super::*;

-    let events = vec![Event::Log(log1), Event::Log(log2)];
+    #[test]
+    fn test_encode_null_values() {
+        let events = vec![
+            create_event(vec![("field_a", 1_i64)]),
+            create_event(vec![("field_b", 2_i64)]),
+        ];

-    // Schema expects Int64
-    let schema = SchemaRef::new(Schema::new(vec![Field::new(
-        "field",
-        DataType::Int64,
-        true,
-    )]));
+        let schema = SchemaRef::new(Schema::new(vec![
+            Field::new("field_a", DataType::Int64, true),
+            Field::new("field_b", DataType::Int64, true),
+        ]));

-    let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-    assert!(result.is_ok());
+        let batch = encode_and_decode(events, schema).unwrap();

-    let bytes = result.unwrap();
-    let cursor = Cursor::new(bytes);
-    let mut reader = StreamReader::try_new(cursor, None).unwrap();
-    let batch = reader.next().unwrap().unwrap();
-
-    assert_eq!(batch.num_rows(), 2);
+        assert_eq!(batch.num_rows(), 2);
+        assert_int64_column(&batch, 0, &[Some(1), None]);
+        assert_int64_column(&batch, 1, &[None, Some(2)]);
+    }

-    let field_array = batch
-        .column(0)
-        .as_any()
-        .downcast_ref::<Int64Array>()
-        .unwrap();
-    assert_eq!(field_array.value(0), 42);
-    assert!(field_array.is_null(1)); // Type mismatch becomes null
-}
+    #[test]
+    fn test_encode_type_mismatches() {
+        let events = vec![
+            create_event(vec![("field", 42_i64)]),
+            create_event(vec![("field", 3.15_f64)]), // Type mismatch!
+        ];

-#[test]
-fn test_encode_complex_json_values() {
-    use serde_json::json;
+        let schema = single_field_schema("field", DataType::Int64, true);
+        let batch = encode_and_decode(events, schema).unwrap();

-    let mut log = LogEvent::default();
-    log.insert(
-        "object_field",
-        json!({"key": "value", "nested": {"count": 42}}),
-    );
-    log.insert("array_field", json!([1, 2, 3]));
+        assert_eq!(batch.num_rows(), 2);
+        // Type mismatch becomes null
+        assert_int64_column(&batch, 0, &[Some(42), None]);
+    }
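+
+    // The mismatch above only degrades to null because the schema marks the
+    // field as nullable; a null for a non-nullable field is reported as
+    // ArrowEncodingError::NullConstraint instead (see
+    // primitive_types::test_encode_non_nullable_field_with_null_value).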
"[1,2,3]"); -} + let schema = SchemaRef::new(Schema::new(vec![ + Field::new( + "empty_array", + DataType::List(array_field.clone().into()), + true, + ), + Field::new("empty_map", DataType::Map(map_entries.into(), false), true), + Field::new("non_empty_array", DataType::List(array_field.into()), true), + ])); -#[test] -fn test_encode_unsupported_type() { - let mut log = LogEvent::default(); - log.insert("field", "value"); - - let events = vec![Event::Log(log)]; - - // Use an unsupported type - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "field", - DataType::Duration(TimeUnit::Millisecond), - true, - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(schema)); - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - ArrowEncodingError::UnsupportedType { .. } - )); -} + let batch = encode_and_decode(events, schema).unwrap(); -#[test] -fn test_encode_without_schema_fails() { - let mut log1 = LogEvent::default(); - log1.insert("message", "hello"); + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 3); - let events = vec![Event::Log(log1)]; + // Verify empty array + let empty_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!empty_array.is_null(0)); + assert_eq!(empty_array.value(0).len(), 0); - let result = encode_events_to_arrow_ipc_stream(&events, None); - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - ArrowEncodingError::NoSchemaProvided - )); -} + // Verify empty map + let empty_map = batch.column(1).as_any().downcast_ref::().unwrap(); + assert!(!empty_map.is_null(0)); + assert_eq!(empty_map.value(0).len(), 0); -#[test] -fn test_encode_empty_events() { - let events: Vec = vec![]; - let result = encode_events_to_arrow_ipc_stream(&events, None); - assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), ArrowEncodingError::NoEvents)); + // Verify non-empty array + let non_empty_array = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!non_empty_array.is_null(0)); + assert_eq!(non_empty_array.value(0).len(), 2); + } } -#[test] -fn test_encode_timestamp_precisions() { - let now = Utc::now(); - let mut log = LogEvent::default(); - log.insert("ts_second", now); - log.insert("ts_milli", now); - log.insert("ts_micro", now); - log.insert("ts_nano", now); +mod json_serialization { + use super::*; - let events = vec![Event::Log(log)]; + #[test] + fn test_encode_complex_json_values() { + use serde_json::json; - let schema = SchemaRef::new(Schema::new(vec![ - Field::new( - "ts_second", - DataType::Timestamp(TimeUnit::Second, None), - true, - ), - Field::new( - "ts_milli", - DataType::Timestamp(TimeUnit::Millisecond, None), - true, - ), - Field::new( - "ts_micro", - DataType::Timestamp(TimeUnit::Microsecond, None), - true, - ), - Field::new( - "ts_nano", - DataType::Timestamp(TimeUnit::Nanosecond, None), - true, - ), - ])); + let mut log = LogEvent::default(); + log.insert( + "object_field", + json!({"key": "value", "nested": {"count": 42}}), + ); + log.insert("array_field", json!([1, 2, 3])); - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); + let events = vec![Event::Log(log)]; - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let schema = SchemaRef::new(Schema::new(vec![ + Field::new("object_field", DataType::Utf8, true), + Field::new("array_field", 
-#[test]
-fn test_encode_timestamp_precisions() {
-    let now = Utc::now();
-    let mut log = LogEvent::default();
-    log.insert("ts_second", now);
-    log.insert("ts_milli", now);
-    log.insert("ts_micro", now);
-    log.insert("ts_nano", now);
+mod json_serialization {
+    use super::*;

-    let events = vec![Event::Log(log)];
+    #[test]
+    fn test_encode_complex_json_values() {
+        use serde_json::json;

-    let schema = SchemaRef::new(Schema::new(vec![
-        Field::new(
-            "ts_second",
-            DataType::Timestamp(TimeUnit::Second, None),
-            true,
-        ),
-        Field::new(
-            "ts_milli",
-            DataType::Timestamp(TimeUnit::Millisecond, None),
-            true,
-        ),
-        Field::new(
-            "ts_micro",
-            DataType::Timestamp(TimeUnit::Microsecond, None),
-            true,
-        ),
-        Field::new(
-            "ts_nano",
-            DataType::Timestamp(TimeUnit::Nanosecond, None),
-            true,
-        ),
-    ]));
+        let mut log = LogEvent::default();
+        log.insert(
+            "object_field",
+            json!({"key": "value", "nested": {"count": 42}}),
+        );
+        log.insert("array_field", json!([1, 2, 3]));

-    let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-    assert!(result.is_ok());
+        let events = vec![Event::Log(log)];

-    let bytes = result.unwrap();
-    let cursor = Cursor::new(bytes);
-    let mut reader = StreamReader::try_new(cursor, None).unwrap();
-    let batch = reader.next().unwrap().unwrap();
+        let schema = SchemaRef::new(Schema::new(vec![
+            Field::new("object_field", DataType::Utf8, true),
+            Field::new("array_field", DataType::Utf8, true),
+        ]));

-    assert_eq!(batch.num_rows(), 1);
-    assert_eq!(batch.num_columns(), 4);
+        let batch = encode_and_decode(events, schema).unwrap();

-    let ts_second = batch
-        .column(0)
-        .as_any()
-        .downcast_ref::<TimestampSecondArray>()
-        .unwrap();
-    assert!(!ts_second.is_null(0));
-    assert_eq!(ts_second.value(0), now.timestamp());
+        assert_eq!(batch.num_rows(), 1);

-    let ts_milli = batch
-        .column(1)
-        .as_any()
-        .downcast_ref::<TimestampMillisecondArray>()
-        .unwrap();
-    assert!(!ts_milli.is_null(0));
-    assert_eq!(ts_milli.value(0), now.timestamp_millis());
-
-    let ts_micro = batch
-        .column(2)
-        .as_any()
-        .downcast_ref::<TimestampMicrosecondArray>()
-        .unwrap();
-    assert!(!ts_micro.is_null(0));
-    assert_eq!(ts_micro.value(0), now.timestamp_micros());
+        let object_array = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        let object_str = object_array.value(0);
+        assert!(object_str.contains("key"));
+        assert!(object_str.contains("value"));

-    let ts_nano = batch
-        .column(3)
-        .as_any()
-        .downcast_ref::<TimestampNanosecondArray>()
-        .unwrap();
-    assert!(!ts_nano.is_null(0));
-    assert_eq!(ts_nano.value(0), now.timestamp_nanos_opt().unwrap());
+        let array_array = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        assert_eq!(array_array.value(0), "[1,2,3]");
+    }
 }

-#[test]
-fn test_encode_mixed_timestamp_string_and_native() {
-    // Test mixing string timestamps with native Timestamp values
-    let mut log1 = LogEvent::default();
-    log1.insert("ts", "2025-10-22T10:18:44.256Z"); // String
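+// Each test below drives the encoder into one failure mode and asserts the
+// matching ArrowEncodingError variant, following the pattern:
+//
+//     let result = encode_events_to_arrow_ipc_stream(&events, None);
+//     assert!(matches!(result.unwrap_err(), ArrowEncodingError::NoSchemaProvided));
+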
+mod error_handling {
+    use super::*;

-    let mut log2 = LogEvent::default();
-    log2.insert("ts", Utc::now()); // Native Timestamp
+    #[test]
+    fn test_encode_unsupported_type() {
+        let events = vec![create_event(vec![("field", "value")])];

-    let mut log3 = LogEvent::default();
-    log3.insert("ts", 1729594724256000000_i64); // Integer (nanoseconds)
+        let schema = single_field_schema("field", DataType::Duration(TimeUnit::Millisecond), true);

-    let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)];
+        let result = encode_events_to_arrow_ipc_stream(&events, Some(schema));
+        assert!(result.is_err());
+        assert!(matches!(
+            result.unwrap_err(),
+            ArrowEncodingError::UnsupportedType { .. }
+        ));
+    }

-    let schema = SchemaRef::new(Schema::new(vec![Field::new(
-        "ts",
-        DataType::Timestamp(TimeUnit::Nanosecond, None),
-        true,
-    )]));
+    #[test]
+    fn test_encode_without_schema_fails() {
+        let events = vec![create_event(vec![("message", "hello")])];

-    let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema)));
-    assert!(result.is_ok());
+        let result = encode_events_to_arrow_ipc_stream(&events, None);
+        assert!(result.is_err());
+        assert!(matches!(
+            result.unwrap_err(),
+            ArrowEncodingError::NoSchemaProvided
+        ));
+    }

-    let bytes = result.unwrap();
-    let cursor = Cursor::new(bytes);
-    let mut reader = StreamReader::try_new(cursor, None).unwrap();
-    let batch = reader.next().unwrap().unwrap();
+    #[test]
+    fn test_encode_empty_events() {
+        let events: Vec<Event> = vec![];
+        let result = encode_events_to_arrow_ipc_stream(&events, None);
+        assert!(result.is_err());
+        assert!(matches!(result.unwrap_err(), ArrowEncodingError::NoEvents));
+    }
+}

-    assert_eq!(batch.num_rows(), 3);
+mod temporal_types {
+    use super::*;
+
+    #[test]
+    fn test_encode_timestamp_precisions() {
+        let now = Utc::now();
+        let mut log = LogEvent::default();
+        log.insert("ts_second", now);
+        log.insert("ts_milli", now);
+        log.insert("ts_micro", now);
+        log.insert("ts_nano", now);
+
+        let events = vec![Event::Log(log)];
+
+        let schema = SchemaRef::new(Schema::new(vec![
+            Field::new(
+                "ts_second",
+                DataType::Timestamp(TimeUnit::Second, None),
+                true,
+            ),
+            Field::new(
+                "ts_milli",
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                true,
+            ),
+            Field::new(
+                "ts_micro",
+                DataType::Timestamp(TimeUnit::Microsecond, None),
+                true,
+            ),
+            Field::new(
+                "ts_nano",
+                DataType::Timestamp(TimeUnit::Nanosecond, None),
+                true,
+            ),
+        ]));
+
+        let batch = encode_and_decode(events, schema).unwrap();
+
+        assert_eq!(batch.num_rows(), 1);
+        assert_eq!(batch.num_columns(), 4);
+
+        let ts_second = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<TimestampSecondArray>()
+            .unwrap();
+        assert!(!ts_second.is_null(0));
+        assert_eq!(ts_second.value(0), now.timestamp());

-    let ts_array = batch
-        .column(0)
-        .as_any()
-        .downcast_ref::<TimestampNanosecondArray>()
-        .unwrap();
-
-    // All three should be non-null
-    assert!(!ts_array.is_null(0));
-    assert!(!ts_array.is_null(1));
-    assert!(!ts_array.is_null(2));
-
-    // First one should match the parsed string
-    let expected = chrono::DateTime::parse_from_rfc3339("2025-10-22T10:18:44.256Z")
-        .unwrap()
-        .timestamp_nanos_opt()
-        .unwrap();
-    assert_eq!(ts_array.value(0), expected);
-
-    // Third one should match the integer
-    assert_eq!(ts_array.value(2), 1729594724256000000_i64);
-}
+        let ts_milli = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<TimestampMillisecondArray>()
+            .unwrap();
+        assert!(!ts_milli.is_null(0));
+        assert_eq!(ts_milli.value(0), now.timestamp_millis());

-#[test]
-fn test_encode_invalid_string_timestamp() {
-    // Test that invalid timestamp strings become null
-    let mut log1 = LogEvent::default();
-    log1.insert("timestamp", "not-a-timestamp");
+        let ts_micro = batch
+            .column(2)
+            .as_any()
+            .downcast_ref::<TimestampMicrosecondArray>()
+            .unwrap();
+        assert!(!ts_micro.is_null(0));
+        assert_eq!(ts_micro.value(0), now.timestamp_micros());

-    let mut log2 = LogEvent::default();
-    log2.insert("timestamp", "2025-10-22T10:18:44.256Z"); // Valid
+        let ts_nano = batch
+            .column(3)
+            .as_any()
+            .downcast_ref::<TimestampNanosecondArray>()
+            .unwrap();
+        assert!(!ts_nano.is_null(0));
+        assert_eq!(ts_nano.value(0), now.timestamp_nanos_opt().unwrap());
+    }

-    let mut log3 = LogEvent::default();
-    log3.insert("timestamp", "2025-99-99T99:99:99Z"); // Invalid
+    #[test]
+    fn 
test_encode_mixed_timestamp_string_and_native() { + // Test mixing string timestamps with native Timestamp values + let mut log1 = LogEvent::default(); + log1.insert("ts", "2025-10-22T10:18:44.256Z"); // String - let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + let mut log2 = LogEvent::default(); + log2.insert("ts", Utc::now()); // Native Timestamp - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "timestamp", - DataType::Timestamp(TimeUnit::Nanosecond, None), - true, - )])); + let mut log3 = LogEvent::default(); + log3.insert("ts", 1729594724256000000_i64); // Integer (nanoseconds) - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); - assert_eq!(batch.num_rows(), 3); + let batch = encode_and_decode(events, schema).unwrap(); - let ts_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); + assert_eq!(batch.num_rows(), 3); - // Invalid timestamps should be null - assert!(ts_array.is_null(0)); - assert!(!ts_array.is_null(1)); // Valid one - assert!(ts_array.is_null(2)); -} + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); -#[test] -fn test_encode_decimal128_from_integer() { - use arrow::array::Decimal128Array; + // All three should be non-null + assert!(!ts_array.is_null(0)); + assert!(!ts_array.is_null(1)); + assert!(!ts_array.is_null(2)); - let mut log = LogEvent::default(); - // Store quantity as integer: 1000 - log.insert("quantity", 1000_i64); + // First one should match the parsed string + let expected = chrono::DateTime::parse_from_rfc3339("2025-10-22T10:18:44.256Z") + .unwrap() + .timestamp_nanos_opt() + .unwrap(); + assert_eq!(ts_array.value(0), expected); - let events = vec![Event::Log(log)]; + // Third one should match the integer + assert_eq!(ts_array.value(2), 1729594724256000000_i64); + } - // Decimal(10, 3) - will represent 1000 as 1000.000 - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "quantity", - DataType::Decimal128(10, 3), - true, - )])); + #[test] + fn test_encode_invalid_string_timestamp() { + // Test that invalid timestamp strings become null + let mut log1 = LogEvent::default(); + log1.insert("timestamp", "not-a-timestamp"); - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); + let mut log2 = LogEvent::default(); + log2.insert("timestamp", "2025-10-22T10:18:44.256Z"); // Valid - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let mut log3 = LogEvent::default(); + log3.insert("timestamp", "2025-99-99T99:99:99Z"); // Invalid - assert_eq!(batch.num_rows(), 1); + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - let decimal_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); - assert!(!decimal_array.is_null(0)); - // 1000 with scale 3 = 1000 * 10^3 = 1000000 - 
assert_eq!(decimal_array.value(0), 1000000_i128); -} + let batch = encode_and_decode(events, schema).unwrap(); -#[test] -fn test_encode_decimal256() { - use arrow::array::Decimal256Array; + assert_eq!(batch.num_rows(), 3); - let mut log = LogEvent::default(); - // Very large precision number - log.insert("big_value", 123456789.123456_f64); + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); - let events = vec![Event::Log(log)]; + // Invalid timestamps should be null + assert!(ts_array.is_null(0)); + assert!(!ts_array.is_null(1)); // Valid one + assert!(ts_array.is_null(2)); + } +} - // Decimal256(50, 6) - high precision decimal - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "big_value", - DataType::Decimal256(50, 6), - true, - )])); +mod decimal_types { + use super::*; - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); + #[test] + fn test_encode_decimal128_from_integer() { + use arrow::array::Decimal128Array; - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let mut log = LogEvent::default(); + // Store quantity as integer: 1000 + log.insert("quantity", 1000_i64); - assert_eq!(batch.num_rows(), 1); + let events = vec![Event::Log(log)]; - let decimal_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); + // Decimal(10, 3) - will represent 1000 as 1000.000 + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "quantity", + DataType::Decimal128(10, 3), + true, + )])); - assert!(!decimal_array.is_null(0)); - // Value should be non-null and encoded - let value = decimal_array.value(0); - assert!(value.to_i128().is_some()); -} + let batch = encode_and_decode(events, schema).unwrap(); -#[test] -fn test_encode_decimal_null_values() { - use arrow::array::Decimal128Array; + assert_eq!(batch.num_rows(), 1); - let mut log1 = LogEvent::default(); - log1.insert("price", 99.99_f64); + let decimal_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); - let log2 = LogEvent::default(); - // No price field - should be null + assert!(!decimal_array.is_null(0)); + // 1000 with scale 3 = 1000 * 10^3 = 1000000 + assert_eq!(decimal_array.value(0), 1000000_i128); + } - let mut log3 = LogEvent::default(); - log3.insert("price", 50.00_f64); + #[test] + fn test_encode_decimal256() { + use arrow::array::Decimal256Array; - let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + let mut log = LogEvent::default(); + // Very large precision number + log.insert("big_value", 123456789.123456_f64); - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "price", - DataType::Decimal128(10, 2), - true, - )])); + let events = vec![Event::Log(log)]; - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); + // Decimal256(50, 6) - high precision decimal + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "big_value", + DataType::Decimal256(50, 6), + true, + )])); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let batch = encode_and_decode(events, schema).unwrap(); - assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_rows(), 1); - let decimal_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); + let decimal_array = 
batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); - // First row: 99.99 - assert!(!decimal_array.is_null(0)); - assert_eq!(decimal_array.value(0), 9999_i128); + assert!(!decimal_array.is_null(0)); + // Value should be non-null and encoded + let value = decimal_array.value(0); + assert!(value.to_i128().is_some()); + } - // Second row: null - assert!(decimal_array.is_null(1)); + #[test] + fn test_encode_decimal_null_values() { + use arrow::array::Decimal128Array; - // Third row: 50.00 - assert!(!decimal_array.is_null(2)); - assert_eq!(decimal_array.value(2), 5000_i128); -} + let mut log1 = LogEvent::default(); + log1.insert("price", 99.99_f64); -#[test] -fn test_encode_unsigned_integers_with_null_and_overflow() { - use arrow::array::{UInt8Array, UInt32Array}; + let log2 = LogEvent::default(); + // No price field - should be null - let mut log1 = LogEvent::default(); - log1.insert("uint8_field", 100_i64); - log1.insert("uint32_field", 1000_i64); + let mut log3 = LogEvent::default(); + log3.insert("price", 50.00_f64); - let mut log2 = LogEvent::default(); - log2.insert("uint8_field", 300_i64); // Overflow - should be null - log2.insert("uint32_field", -1_i64); // Negative - should be null + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - let log3 = LogEvent::default(); - // Missing fields - should be null + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "price", + DataType::Decimal128(10, 2), + true, + )])); - let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + let batch = encode_and_decode(events, schema).unwrap(); - let schema = SchemaRef::new(Schema::new(vec![ - Field::new("uint8_field", DataType::UInt8, true), - Field::new("uint32_field", DataType::UInt32, true), - ])); + assert_eq!(batch.num_rows(), 3); - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); + let decimal_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + // First row: 99.99 + assert!(!decimal_array.is_null(0)); + assert_eq!(decimal_array.value(0), 9999_i128); - assert_eq!(batch.num_rows(), 3); + // Second row: null + assert!(decimal_array.is_null(1)); - // Check uint8 column - let uint8_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(uint8_array.value(0), 100_u8); // Valid - assert!(uint8_array.is_null(1)); // Overflow - assert!(uint8_array.is_null(2)); // Missing - - // Check uint32 column - let uint32_array = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(uint32_array.value(0), 1000_u32); // Valid - assert!(uint32_array.is_null(1)); // Negative - assert!(uint32_array.is_null(2)); // Missing + // Third row: 50.00 + assert!(!decimal_array.is_null(2)); + assert_eq!(decimal_array.value(2), 5000_i128); + } } -#[test] -fn test_encode_non_nullable_field_with_null_value() { - // Test that encoding fails when a non-nullable field encounters a null value - let mut log1 = LogEvent::default(); - log1.insert("required_field", 42); +mod primitive_types { + use super::*; - let log2 = LogEvent::default(); - // log2 is missing required_field - should cause an error + #[test] + fn test_encode_unsigned_integers_with_null_and_overflow() { + use arrow::array::{UInt8Array, UInt32Array}; - let events = vec![Event::Log(log1), Event::Log(log2)]; + 
let mut log1 = LogEvent::default(); + log1.insert("uint8_field", 100_i64); + log1.insert("uint32_field", 1000_i64); - // Create schema with non-nullable field - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "required_field", - DataType::Int64, - false, // Not nullable - )])); + let mut log2 = LogEvent::default(); + log2.insert("uint8_field", 300_i64); // Overflow - should be null + log2.insert("uint32_field", -1_i64); // Negative - should be null - let result = encode_events_to_arrow_ipc_stream(&events, Some(schema)); - assert!(result.is_err()); - - match result.unwrap_err() { - ArrowEncodingError::NullConstraint { field_name } => { - assert_eq!(field_name, "required_field"); - } - other => panic!("Expected NullConstraint error, got: {:?}", other), - } -} + let log3 = LogEvent::default(); + // Missing fields - should be null -#[test] -fn test_encode_non_nullable_field_all_values_present() { - // Test that encoding succeeds when all values are present for non-nullable field - let mut log1 = LogEvent::default(); - log1.insert("id", 1); + let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; - let mut log2 = LogEvent::default(); - log2.insert("id", 2); + let schema = SchemaRef::new(Schema::new(vec![ + Field::new("uint8_field", DataType::UInt8, true), + Field::new("uint32_field", DataType::UInt32, true), + ])); - let mut log3 = LogEvent::default(); - log3.insert("id", 3); + let batch = encode_and_decode(events, schema).unwrap(); - let events = vec![Event::Log(log1), Event::Log(log2), Event::Log(log3)]; + assert_eq!(batch.num_rows(), 3); - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "id", - DataType::Int64, - false, // Not nullable - )])); + // Check uint8 column + let uint8_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint8_array.value(0), 100_u8); // Valid + assert!(uint8_array.is_null(1)); // Overflow + assert!(uint8_array.is_null(2)); // Missing - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!(result.is_ok()); + // Check uint32 column + let uint32_array = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(uint32_array.value(0), 1000_u32); // Valid + assert!(uint32_array.is_null(1)); // Negative + assert!(uint32_array.is_null(2)); // Missing + } - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + #[test] + fn test_encode_non_nullable_field_with_null_value() { + let events = vec![ + create_event(vec![("required_field", 42_i64)]), + LogEvent::default().into(), // Missing required field + ]; - assert_eq!(batch.num_rows(), 3); + let schema = single_field_schema("required_field", DataType::Int64, false); + let result = encode_events_to_arrow_ipc_stream(&events, Some(schema)); - let id_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!(id_array.value(0), 1); - assert_eq!(id_array.value(1), 2); - assert_eq!(id_array.value(2), 3); - assert!(!id_array.is_null(0)); - assert!(!id_array.is_null(1)); - assert!(!id_array.is_null(2)); -} + assert!(result.is_err()); + match result.unwrap_err() { + ArrowEncodingError::NullConstraint { field_name } => { + assert_eq!(field_name, "required_field"); + } + other => panic!("Expected NullConstraint error, got: {:?}", other), + } + } -#[test] -fn test_config_allow_nullable_fields_overrides_schema() { - use tokio_util::codec::Encoder; + #[test] + fn 
test_encode_non_nullable_field_all_values_present() { + let events = vec![ + create_event(vec![("id", 1_i64)]), + create_event(vec![("id", 2_i64)]), + create_event(vec![("id", 3_i64)]), + ]; - // Create events: One valid, one missing the "required" field - let mut log1 = LogEvent::default(); - log1.insert("strict_field", 42); - let log2 = LogEvent::default(); - let events = vec![Event::Log(log1), Event::Log(log2)]; + let schema = single_field_schema("id", DataType::Int64, false); + let batch = encode_and_decode(events, schema).unwrap(); - let schema = Schema::new(vec![Field::new("strict_field", DataType::Int64, false)]); + assert_eq!(batch.num_rows(), 3); + assert_int64_column(&batch, 0, &[Some(1), Some(2), Some(3)]); + } +} - let mut config = ArrowStreamSerializerConfig::new(schema); - config.allow_nullable_fields = true; +mod config_tests { + use super::*; + use tokio_util::codec::Encoder; - let mut serializer = ArrowStreamSerializer::new(config).expect("Failed to create serializer"); + #[test] + fn test_config_allow_nullable_fields_overrides_schema() { + let mut log1 = LogEvent::default(); + log1.insert("strict_field", 42); + let log2 = LogEvent::default(); + let events = vec![Event::Log(log1), Event::Log(log2)]; - let mut buffer = BytesMut::new(); - serializer - .encode(events, &mut buffer) - .expect("Encoding should succeed when allow_nullable_fields is true"); + let schema = Schema::new(vec![Field::new("strict_field", DataType::Int64, false)]); - let cursor = Cursor::new(buffer); - let mut reader = StreamReader::try_new(cursor, None).expect("Failed to create reader"); - let batch = reader.next().unwrap().expect("Failed to read batch"); + let mut config = ArrowStreamSerializerConfig::new(schema); + config.allow_nullable_fields = true; - assert_eq!(batch.num_rows(), 2); + let mut serializer = + ArrowStreamSerializer::new(config).expect("Failed to create serializer"); - let binding = batch.schema(); - let output_field = binding.field(0); - assert!( - output_field.is_nullable(), - "The output schema field should have been transformed to nullable=true" - ); + let mut buffer = BytesMut::new(); + serializer + .encode(events, &mut buffer) + .expect("Encoding should succeed when allow_nullable_fields is true"); - let array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); + let cursor = Cursor::new(buffer); + let mut reader = StreamReader::try_new(cursor, None).expect("Failed to create reader"); + let batch = reader.next().unwrap().expect("Failed to read batch"); - assert_eq!(array.value(0), 42); - assert!(!array.is_null(0)); - assert!( - array.is_null(1), - "The missing value should be encoded as null" - ); -} + assert_eq!(batch.num_rows(), 2); -#[test] -fn test_make_field_nullable_with_nested_types() { - // Test that make_field_nullable recursively handles List and Struct types + let binding = batch.schema(); + let output_field = binding.field(0); + assert!( + output_field.is_nullable(), + "The output schema field should have been transformed to nullable=true" + ); - // Create a nested structure: Struct containing a List of Structs - // struct { inner_list: [{ nested_field: Int64 }] } - let inner_struct_field = Field::new("nested_field", DataType::Int64, false); - let inner_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![inner_struct_field])); - let list_field = Field::new("item", inner_struct, false); - let list_type = DataType::List(list_field.into()); - let outer_field = Field::new("inner_list", list_type, false); - let outer_struct = 
DataType::Struct(arrow::datatypes::Fields::from(vec![outer_field])); + let array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); - let original_field = Field::new("root", outer_struct, false); + assert_eq!(array.value(0), 42); + assert!(!array.is_null(0)); + assert!( + array.is_null(1), + "The missing value should be encoded as null" + ); + } - // Apply make_field_nullable - let nullable_field = make_field_nullable(&original_field); + #[test] + fn test_make_field_nullable_with_nested_types() { + let inner_struct_field = Field::new("nested_field", DataType::Int64, false); + let inner_struct = + DataType::Struct(arrow::datatypes::Fields::from(vec![inner_struct_field])); + let list_field = Field::new("item", inner_struct, false); + let list_type = DataType::List(list_field.into()); + let outer_field = Field::new("inner_list", list_type, false); + let outer_struct = DataType::Struct(arrow::datatypes::Fields::from(vec![outer_field])); - // Verify root field is nullable - assert!( - nullable_field.is_nullable(), - "Root field should be nullable" - ); + let original_field = Field::new("root", outer_struct, false); + let nullable_field = make_field_nullable(&original_field); - // Verify nested struct is nullable - if let DataType::Struct(root_fields) = nullable_field.data_type() { - let inner_list_field = &root_fields[0]; assert!( - inner_list_field.is_nullable(), - "inner_list field should be nullable" + nullable_field.is_nullable(), + "Root field should be nullable" ); - // Verify list element is nullable - if let DataType::List(list_item_field) = inner_list_field.data_type() { - assert!( - list_item_field.is_nullable(), - "List item field should be nullable" - ); + if let DataType::Struct(root_fields) = nullable_field.data_type() { + let inner_list_field = &root_fields[0]; + assert!(inner_list_field.is_nullable()); - // Verify inner struct fields are nullable - if let DataType::Struct(inner_struct_fields) = list_item_field.data_type() { - let nested_field = &inner_struct_fields[0]; - assert!( - nested_field.is_nullable(), - "nested_field should be nullable" - ); + if let DataType::List(list_item_field) = inner_list_field.data_type() { + assert!(list_item_field.is_nullable()); + + if let DataType::Struct(inner_struct_fields) = list_item_field.data_type() { + let nested_field = &inner_struct_fields[0]; + assert!(nested_field.is_nullable()); + } else { + panic!("Expected Struct type for list items"); + } } else { - panic!("Expected Struct type for list items"); + panic!("Expected List type for inner_list"); } } else { - panic!("Expected List type for inner_list"); + panic!("Expected Struct type for root field"); } - } else { - panic!("Expected Struct type for root field"); } -} -#[test] -fn test_make_field_nullable_with_map_type() { - // Test that make_field_nullable handles Map types - // Map is internally represented as List> - - // Create a map: Map - // Internally: List> - let key_field = Field::new("key", DataType::Utf8, false); - let value_field = Field::new("value", DataType::Int64, false); - let entries_struct = - DataType::Struct(arrow::datatypes::Fields::from(vec![key_field, value_field])); - let entries_field = Field::new("entries", entries_struct, false); - let map_type = DataType::Map(entries_field.into(), false); - - let original_field = Field::new("my_map", map_type, false); - - // Apply make_field_nullable - let nullable_field = make_field_nullable(&original_field); - - // Verify root field is nullable - assert!( - nullable_field.is_nullable(), - "Root map field 
should be nullable" - ); + #[test] + fn test_make_field_nullable_with_map_type() { + let key_field = Field::new("key", DataType::Utf8, false); + let value_field = Field::new("value", DataType::Int64, false); + let entries_struct = + DataType::Struct(arrow::datatypes::Fields::from(vec![key_field, value_field])); + let entries_field = Field::new("entries", entries_struct, false); + let map_type = DataType::Map(entries_field.into(), false); + + let original_field = Field::new("my_map", map_type, false); + let nullable_field = make_field_nullable(&original_field); - // Verify map entries nullability matches MapBuilder behavior - if let DataType::Map(entries_field, _sorted) = nullable_field.data_type() { - // MapBuilder creates entries struct as non-nullable assert!( - !entries_field.is_nullable(), - "Map entries field should be non-nullable to match MapBuilder" + nullable_field.is_nullable(), + "Root map field should be nullable" ); - // Verify the struct inside the map - if let DataType::Struct(struct_fields) = entries_field.data_type() { - let key_field = &struct_fields[0]; - let value_field = &struct_fields[1]; - // MapBuilder keeps keys as non-nullable - assert!( - !key_field.is_nullable(), - "Map key field should be non-nullable to match MapBuilder" - ); - // But values field should be transformed to nullable + if let DataType::Map(entries_field, _sorted) = nullable_field.data_type() { assert!( - value_field.is_nullable(), - "Map value field should be nullable" + !entries_field.is_nullable(), + "Map entries field should be non-nullable" ); + + if let DataType::Struct(struct_fields) = entries_field.data_type() { + let key_field = &struct_fields[0]; + let value_field = &struct_fields[1]; + assert!( + !key_field.is_nullable(), + "Map key field should be non-nullable" + ); + assert!( + value_field.is_nullable(), + "Map value field should be nullable" + ); + } else { + panic!("Expected Struct type for map entries"); + } } else { - panic!("Expected Struct type for map entries"); + panic!("Expected Map type for my_map field"); } - } else { - panic!("Expected Map type for my_map field"); } } -#[test] -fn test_encode_nested_maps() { - use arrow::array::MapArray; - use vrl::value::ObjectMap; - - // Create nested map: Map> - // {"outer_key1": {"inner_key1": 100, "inner_key2": 200}, "outer_key2": {"inner_key3": 300}} - let mut inner_map1 = ObjectMap::new(); - inner_map1.insert("inner_key1".into(), Value::Integer(100)); - inner_map1.insert("inner_key2".into(), Value::Integer(200)); - - let mut inner_map2 = ObjectMap::new(); - inner_map2.insert("inner_key3".into(), Value::Integer(300)); - - let mut outer_map = ObjectMap::new(); - outer_map.insert("outer_key1".into(), Value::Object(inner_map1)); - outer_map.insert("outer_key2".into(), Value::Object(inner_map2)); - - let mut log = LogEvent::default(); - log.insert("nested_map", Value::Object(outer_map)); - - let events = vec![Event::Log(log)]; - - // Define schema: Map> - // Note: MapBuilder uses "keys" and "values" (plural) as field names - let inner_map_entries = Field::new( - "entries", - DataType::Struct(arrow::datatypes::Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Int32, true), - ])), - false, - ); - let inner_map_type = DataType::Map(inner_map_entries.into(), false); - - let outer_map_entries = Field::new( - "entries", - DataType::Struct(arrow::datatypes::Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", inner_map_type, true), - ])), - false, - ); - let 
outer_map_type = DataType::Map(outer_map_entries.into(), false); - - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "nested_map", - outer_map_type, - true, - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!( - result.is_ok(), - "Failed to encode nested maps: {:?}", - result.as_ref().err() - ); - - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); +mod nested_types { + use super::*; + + #[test] + fn test_encode_nested_maps() { + use arrow::array::MapArray; + use vrl::value::ObjectMap; + + // Create nested map: Map> + // {"outer_key1": {"inner_key1": 100, "inner_key2": 200}, "outer_key2": {"inner_key3": 300}} + let mut inner_map1 = ObjectMap::new(); + inner_map1.insert("inner_key1".into(), Value::Integer(100)); + inner_map1.insert("inner_key2".into(), Value::Integer(200)); + + let mut inner_map2 = ObjectMap::new(); + inner_map2.insert("inner_key3".into(), Value::Integer(300)); + + let mut outer_map = ObjectMap::new(); + outer_map.insert("outer_key1".into(), Value::Object(inner_map1)); + outer_map.insert("outer_key2".into(), Value::Object(inner_map2)); + + let mut log = LogEvent::default(); + log.insert("nested_map", Value::Object(outer_map)); + + let events = vec![Event::Log(log)]; + + // Define schema: Map> + // Note: MapBuilder uses "keys" and "values" (plural) as field names + let inner_map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + let inner_map_type = DataType::Map(inner_map_entries.into(), false); + + let outer_map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", inner_map_type, true), + ])), + false, + ); + let outer_map_type = DataType::Map(outer_map_entries.into(), false); - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "nested_map", + outer_map_type, + true, + )])); - // Verify the outer map exists - let outer_map_array = batch.column(0).as_any().downcast_ref::().unwrap(); - assert_eq!(outer_map_array.len(), 1); - assert!(!outer_map_array.is_null(0), "Outer map should not be null"); + let batch = encode_and_decode(events, schema).expect("Failed to encode nested maps"); - // Get the outer map's values (which are inner maps) - let outer_map_value = outer_map_array.value(0); - assert_eq!(outer_map_value.len(), 2, "Outer map should have 2 entries"); + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); - // The outer map's values are themselves a MapArray - let inner_maps = outer_map_array.values(); - let inner_maps_array = inner_maps.as_any().downcast_ref::().unwrap(); + // Verify the outer map exists + let outer_map_array = batch.column(0).as_any().downcast_ref::().unwrap(); + assert_eq!(outer_map_array.len(), 1); + assert!(!outer_map_array.is_null(0), "Outer map should not be null"); - // Verify we have 2 inner maps (one for each outer key) - // Total entries across both inner maps: 2 + 1 = 3 - assert_eq!(inner_maps_array.len(), 2, "Should have 2 inner maps"); + // Get the outer map's values (which are inner maps) + let outer_map_value = outer_map_array.value(0); + assert_eq!(outer_map_value.len(), 2, "Outer map should 
have 2 entries"); - // Verify first inner map has 2 entries - let first_inner_map = inner_maps_array.value(0); - assert_eq!( - first_inner_map.len(), - 2, - "First inner map should have 2 entries" - ); + // The outer map's values are themselves a MapArray + let inner_maps = outer_map_array.values(); + let inner_maps_array = inner_maps.as_any().downcast_ref::().unwrap(); - // Verify second inner map has 1 entry - let second_inner_map = inner_maps_array.value(1); - assert_eq!( - second_inner_map.len(), - 1, - "Second inner map should have 1 entry" - ); -} + // Verify we have 2 inner maps (one for each outer key) + // Total entries across both inner maps: 2 + 1 = 3 + assert_eq!(inner_maps_array.len(), 2, "Should have 2 inner maps"); -#[test] -fn test_encode_array_of_maps() { - use arrow::array::ListArray; - use vrl::value::ObjectMap; + // Verify first inner map has 2 entries + let first_inner_map = inner_maps_array.value(0); + assert_eq!( + first_inner_map.len(), + 2, + "First inner map should have 2 entries" + ); - // Create array of maps: Array> - // [{"key1": 100, "key2": 200}, {"key3": 300}] - let mut map1 = ObjectMap::new(); - map1.insert("key1".into(), Value::Integer(100)); - map1.insert("key2".into(), Value::Integer(200)); + // Verify second inner map has 1 entry + let second_inner_map = inner_maps_array.value(1); + assert_eq!( + second_inner_map.len(), + 1, + "Second inner map should have 1 entry" + ); + } - let mut map2 = ObjectMap::new(); - map2.insert("key3".into(), Value::Integer(300)); + #[test] + fn test_encode_array_of_maps() { + use arrow::array::ListArray; + use vrl::value::ObjectMap; - let array_of_maps = Value::Array(vec![Value::Object(map1), Value::Object(map2)]); + // Create array of maps: Array> + // [{"key1": 100, "key2": 200}, {"key3": 300}] + let mut map1 = ObjectMap::new(); + map1.insert("key1".into(), Value::Integer(100)); + map1.insert("key2".into(), Value::Integer(200)); - let mut log = LogEvent::default(); - log.insert("array_of_maps", array_of_maps); - - let events = vec![Event::Log(log)]; - - // Define schema: List> - // Note: MapBuilder uses "keys" and "values" (plural) as field names - let map_entries = Field::new( - "entries", - DataType::Struct(arrow::datatypes::Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Int32, true), - ])), - false, - ); - let map_type = DataType::Map(map_entries.into(), false); - let list_field = Field::new("item", map_type, true); - - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "array_of_maps", - DataType::List(list_field.into()), - true, - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!( - result.is_ok(), - "Failed to encode array of maps: {:?}", - result.as_ref().err() - ); + let mut map2 = ObjectMap::new(); + map2.insert("key3".into(), Value::Integer(300)); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let array_of_maps = Value::Array(vec![Value::Object(map1), Value::Object(map2)]); - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); + let mut log = LogEvent::default(); + log.insert("array_of_maps", array_of_maps); - // Verify the array exists - let list_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!list_array.is_null(0), "Array should not be null"); - assert_eq!(list_array.value(0).len(), 2, "Array should have 2 maps"); - 
- // Verify the maps inside the array - let maps = list_array.value(0); - let map_array = maps - .as_any() - .downcast_ref::() - .unwrap(); + let events = vec![Event::Log(log)]; - // First map should have 2 entries - let first_map = map_array.value(0); - assert_eq!(first_map.len(), 2, "First map should have 2 entries"); + // Define schema: List> + // Note: MapBuilder uses "keys" and "values" (plural) as field names + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); + let map_type = DataType::Map(map_entries.into(), false); + let list_field = Field::new("item", map_type, true); - // Second map should have 1 entry - let second_map = map_array.value(1); - assert_eq!(second_map.len(), 1, "Second map should have 1 entry"); -} + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "array_of_maps", + DataType::List(list_field.into()), + true, + )])); -#[test] -fn test_encode_array_of_structs() { - use arrow::array::ListArray; - use vrl::value::ObjectMap; + let batch = encode_and_decode(events, schema).expect("Failed to encode array of maps"); - // Create array of structs (tuples): Array - // [{"f0": "value1", "f1": 100}, {"f0": "value2", "f1": 200}] - let mut tuple1 = ObjectMap::new(); - tuple1.insert("f0".into(), Value::Bytes("value1".into())); - tuple1.insert("f1".into(), Value::Integer(100)); + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); - let mut tuple2 = ObjectMap::new(); - tuple2.insert("f0".into(), Value::Bytes("value2".into())); - tuple2.insert("f1".into(), Value::Integer(200)); + // Verify the array exists + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0), "Array should not be null"); + assert_eq!(list_array.value(0).len(), 2, "Array should have 2 maps"); - let array_of_structs = Value::Array(vec![Value::Object(tuple1), Value::Object(tuple2)]); + // Verify the maps inside the array + let maps = list_array.value(0); + let map_array = maps + .as_any() + .downcast_ref::() + .unwrap(); - let mut log = LogEvent::default(); - log.insert("array_of_structs", array_of_structs); - - let events = vec![Event::Log(log)]; - - // Define schema: List - let struct_fields = arrow::datatypes::Fields::from(vec![ - Field::new("f0", DataType::Utf8, true), - Field::new("f1", DataType::Int32, true), - ]); - let struct_type = DataType::Struct(struct_fields); - let list_field = Field::new("item", struct_type, true); - - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "array_of_structs", - DataType::List(list_field.into()), - true, - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!( - result.is_ok(), - "Failed to encode array of structs: {:?}", - result.as_ref().err() - ); + // First map should have 2 entries + let first_map = map_array.value(0); + assert_eq!(first_map.len(), 2, "First map should have 2 entries"); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + // Second map should have 1 entry + let second_map = map_array.value(1); + assert_eq!(second_map.len(), 1, "Second map should have 1 entry"); + } - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); + #[test] + fn test_encode_array_of_structs() { + use arrow::array::ListArray; + use 
vrl::value::ObjectMap; - // Verify the array exists and has the correct number of elements - let list_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!list_array.is_null(0), "Array should not be null"); - assert_eq!(list_array.value(0).len(), 2, "Array should have 2 structs"); - - // Verify the structs inside the array - let struct_array = list_array.value(0); - let struct_array = struct_array - .as_any() - .downcast_ref::() - .unwrap(); + // Create array of structs (tuples): Array + // [{"f0": "value1", "f1": 100}, {"f0": "value2", "f1": 200}] + let mut tuple1 = ObjectMap::new(); + tuple1.insert("f0".into(), Value::Bytes("value1".into())); + tuple1.insert("f1".into(), Value::Integer(100)); - // Check first struct field (f0 - strings) - let f0_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f0_array.value(0), "value1"); - assert_eq!(f0_array.value(1), "value2"); - - // Check second struct field (f1 - integers) - let f1_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(f1_array.value(0), 100); - assert_eq!(f1_array.value(1), 200); -} + let mut tuple2 = ObjectMap::new(); + tuple2.insert("f0".into(), Value::Bytes("value2".into())); + tuple2.insert("f1".into(), Value::Integer(200)); -#[test] -fn test_encode_empty_arrays_and_maps() { - use arrow::array::{ListArray, MapArray}; - use vrl::value::ObjectMap; + let array_of_structs = Value::Array(vec![Value::Object(tuple1), Value::Object(tuple2)]); - // Create log with empty array and empty map - let empty_array = Vec::::new(); - let empty_map = ObjectMap::new(); + let mut log = LogEvent::default(); + log.insert("array_of_structs", array_of_structs); - let mut log = LogEvent::default(); - log.insert("empty_array", Value::Array(empty_array)); - log.insert("empty_map", Value::Object(empty_map)); - log.insert( - "non_empty_array", - Value::Array(vec![Value::Integer(1), Value::Integer(2)]), - ); + let events = vec![Event::Log(log)]; - let events = vec![Event::Log(log)]; - - // Define schema - // Note: MapBuilder uses "keys" and "values" (plural) as field names - let array_field = Field::new("item", DataType::Int32, true); - let map_entries = Field::new( - "entries", - DataType::Struct(arrow::datatypes::Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Int32, true), - ])), - false, - ); + // Define schema: List + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new("f1", DataType::Int32, true), + ]); + let struct_type = DataType::Struct(struct_fields); + let list_field = Field::new("item", struct_type, true); - let schema = SchemaRef::new(Schema::new(vec![ - Field::new( - "empty_array", - DataType::List(array_field.clone().into()), + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "array_of_structs", + DataType::List(list_field.into()), true, - ), - Field::new("empty_map", DataType::Map(map_entries.into(), false), true), - Field::new("non_empty_array", DataType::List(array_field.into()), true), - ])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!( - result.is_ok(), - "Failed to encode empty collections: {:?}", - result.as_ref().err() - ); - - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(batch.num_rows(), 1); - 
assert_eq!(batch.num_columns(), 3); - - // Verify empty array - let empty_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!empty_array.is_null(0), "Empty array should not be null"); - assert_eq!(empty_array.value(0).len(), 0, "Array should be empty"); - - // Verify empty map - let empty_map = batch.column(1).as_any().downcast_ref::().unwrap(); - assert!(!empty_map.is_null(0), "Empty map should not be null"); - assert_eq!(empty_map.value(0).len(), 0, "Map should be empty"); - - // Verify non-empty array - let non_empty_array = batch - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!non_empty_array.is_null(0)); - assert_eq!(non_empty_array.value(0).len(), 2); -} + )])); -#[test] -fn test_encode_deep_nesting() { - use arrow::array::ListArray; + let batch = encode_and_decode(events, schema).expect("Failed to encode array of structs"); - // Create deeply nested array structure (6 levels): - // Array -> Array -> Array -> Array -> Array -> Int32 - let level_5 = Value::Array(vec![Value::Integer(42), Value::Integer(99)]); - let level_4 = Value::Array(vec![level_5]); - let level_3 = Value::Array(vec![level_4]); - let level_2 = Value::Array(vec![level_3]); - let level_1 = Value::Array(vec![level_2]); - - let mut log = LogEvent::default(); - log.insert("deep_array", level_1); - - let events = vec![Event::Log(log)]; - - // Define schema for deep array nesting (6 levels total) - let mut current_field = Field::new("item", DataType::Int32, true); - for _ in 0..5 { - current_field = Field::new("item", DataType::List(current_field.into()), true); - } - - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "deep_array", - current_field.data_type().clone(), - true, - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!( - result.is_ok(), - "Failed to encode deeply nested arrays: {:?}", - result.as_ref().err() - ); + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + // Verify the array exists and has the correct number of elements + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0), "Array should not be null"); + assert_eq!(list_array.value(0).len(), 2, "Array should have 2 structs"); - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); + // Verify the structs inside the array + let struct_array = list_array.value(0); + let struct_array = struct_array + .as_any() + .downcast_ref::() + .unwrap(); - // Verify deep array by navigating down through all levels - // Store intermediate arrays to avoid lifetime issues - let mut arrays: Vec = Vec::new(); - arrays.push(batch.column(0).clone()); + // Check first struct field (f0 - strings) + let f0_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(f0_array.value(0), "value1"); + assert_eq!(f0_array.value(1), "value2"); - // Navigate through 5 nested List levels - for level in 0..5 { - let list_array = arrays[level] + // Check second struct field (f1 - integers) + let f1_array = struct_array + .column(1) .as_any() - .downcast_ref::() - .unwrap_or_else(|| panic!("Expected ListArray at level {}", level)); - assert!( - !list_array.is_null(0), - "Array should not be null at level {}", - level - ); - assert_eq!( - list_array.len(), - 1, - "Array 
should have 1 element at level {}", - level - ); - arrays.push(list_array.value(0)); + .downcast_ref::() + .unwrap(); + assert_eq!(f1_array.value(0), 100); + assert_eq!(f1_array.value(1), 200); } - // Final level (level 5) should be Int32Array with values [42, 99] - let int_array = arrays[5] - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(int_array.len(), 2, "Final array should have 2 elements"); - assert_eq!(int_array.value(0), 42); - assert_eq!(int_array.value(1), 99); -} + #[test] + fn test_encode_deep_nesting() { + use arrow::array::ListArray; -#[test] -fn test_encode_struct_with_list_and_map() { - use arrow::array::{ListArray, MapArray}; - use vrl::value::ObjectMap; - - // Create a struct containing both a list and a map - // Struct { list_field: [1, 2, 3], map_field: {"k1": 10, "k2": 20} } - let mut struct_value = ObjectMap::new(); - struct_value.insert( - "f0".into(), - Value::Array(vec![ - Value::Integer(1), - Value::Integer(2), - Value::Integer(3), - ]), - ); + // Create deeply nested array structure (6 levels): + // Array -> Array -> Array -> Array -> Array -> Int32 + let level_5 = Value::Array(vec![Value::Integer(42), Value::Integer(99)]); + let level_4 = Value::Array(vec![level_5]); + let level_3 = Value::Array(vec![level_4]); + let level_2 = Value::Array(vec![level_3]); + let level_1 = Value::Array(vec![level_2]); - let mut map_value = ObjectMap::new(); - map_value.insert("k1".into(), Value::Integer(10)); - map_value.insert("k2".into(), Value::Integer(20)); - struct_value.insert("f1".into(), Value::Object(map_value)); + let mut log = LogEvent::default(); + log.insert("deep_array", level_1); - let mut log = LogEvent::default(); - log.insert("complex_struct", Value::Object(struct_value)); - - let events = vec![Event::Log(log)]; - - // Define schema: Struct { list_field: List, map_field: Map } - let map_entries = Field::new( - "entries", - DataType::Struct(arrow::datatypes::Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Int32, true), - ])), - false, - ); + let events = vec![Event::Log(log)]; - let struct_fields = arrow::datatypes::Fields::from(vec![ - Field::new( - "f0", - DataType::List(Field::new("item", DataType::Int32, true).into()), + // Define schema for deep array nesting (6 levels total) + let mut current_field = Field::new("item", DataType::Int32, true); + for _ in 0..5 { + current_field = Field::new("item", DataType::List(current_field.into()), true); + } + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "deep_array", + current_field.data_type().clone(), true, - ), - Field::new("f1", DataType::Map(map_entries.into(), false), true), - ]); - - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "complex_struct", - DataType::Struct(struct_fields), - true, - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!( - result.is_ok(), - "Failed to encode struct with list and map: {:?}", - result.err() - ); + )])); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let batch = + encode_and_decode(events, schema).expect("Failed to encode deeply nested arrays"); - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); - // Verify the struct - let struct_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - 
assert!(!struct_array.is_null(0)); + // Verify deep array by navigating down through all levels + // Store intermediate arrays to avoid lifetime issues + let mut arrays: Vec = Vec::new(); + arrays.push(batch.column(0).clone()); - // Verify the list inside the struct (f0) - let list_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!list_array.is_null(0)); - let list_value = list_array.value(0); - assert_eq!(list_value.len(), 3); - let int_array = list_value - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(int_array.value(0), 1); - assert_eq!(int_array.value(1), 2); - assert_eq!(int_array.value(2), 3); - - // Verify the map inside the struct (f1) - let map_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!map_array.is_null(0)); - let map_value = map_array.value(0); - assert_eq!(map_value.len(), 2); -} + // Navigate through 5 nested List levels + for level in 0..5 { + let list_array = arrays[level] + .as_any() + .downcast_ref::() + .unwrap_or_else(|| panic!("Expected ListArray at level {}", level)); + assert!( + !list_array.is_null(0), + "Array should not be null at level {}", + level + ); + assert_eq!( + list_array.len(), + 1, + "Array should have 1 element at level {}", + level + ); + arrays.push(list_array.value(0)); + } -#[test] -fn test_encode_map_with_struct_values() { - use arrow::array::MapArray; - use vrl::value::ObjectMap; + // Final level (level 5) should be Int32Array with values [42, 99] + let int_array = arrays[5] + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(int_array.len(), 2, "Final array should have 2 elements"); + assert_eq!(int_array.value(0), 42); + assert_eq!(int_array.value(1), 99); + } - // Create a map where values are structs - // Map - // {"item1": {"f0": "Alice", "f1": 10}, "item2": {"f0": "Bob", "f1": 20}} - let mut struct1 = ObjectMap::new(); - struct1.insert("f0".into(), Value::Bytes("Alice".into())); - struct1.insert("f1".into(), Value::Integer(10)); + #[test] + fn test_encode_struct_with_list_and_map() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create a struct containing both a list and a map + // Struct { list_field: [1, 2, 3], map_field: {"k1": 10, "k2": 20} } + let mut struct_value = ObjectMap::new(); + struct_value.insert( + "f0".into(), + Value::Array(vec![ + Value::Integer(1), + Value::Integer(2), + Value::Integer(3), + ]), + ); - let mut struct2 = ObjectMap::new(); - struct2.insert("f0".into(), Value::Bytes("Bob".into())); - struct2.insert("f1".into(), Value::Integer(20)); + let mut map_value = ObjectMap::new(); + map_value.insert("k1".into(), Value::Integer(10)); + map_value.insert("k2".into(), Value::Integer(20)); + struct_value.insert("f1".into(), Value::Object(map_value)); - let mut map_value = ObjectMap::new(); - map_value.insert("item1".into(), Value::Object(struct1)); - map_value.insert("item2".into(), Value::Object(struct2)); + let mut log = LogEvent::default(); + log.insert("complex_struct", Value::Object(struct_value)); - let mut log = LogEvent::default(); - log.insert("map_with_structs", Value::Object(map_value)); - - let events = vec![Event::Log(log)]; - - // Define schema: Map - let struct_fields = arrow::datatypes::Fields::from(vec![ - Field::new("f0", DataType::Utf8, true), - Field::new("f1", DataType::Int32, true), - ]); - - let map_entries = Field::new( - "entries", - DataType::Struct(arrow::datatypes::Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", 
DataType::Struct(struct_fields), true), - ])), - false, - ); + let events = vec![Event::Log(log)]; - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "map_with_structs", - DataType::Map(map_entries.into(), false), - true, - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!( - result.is_ok(), - "Failed to encode map with struct values: {:?}", - result.err() - ); + // Define schema: Struct { list_field: List, map_field: Map } + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ])), + false, + ); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new( + "f0", + DataType::List(Field::new("item", DataType::Int32, true).into()), + true, + ), + Field::new("f1", DataType::Map(map_entries.into(), false), true), + ]); + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "complex_struct", + DataType::Struct(struct_fields), + true, + )])); - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); + let batch = + encode_and_decode(events, schema).expect("Failed to encode struct with list and map"); - // Verify the map - let map_array = batch.column(0).as_any().downcast_ref::().unwrap(); - assert!(!map_array.is_null(0)); - let map_value = map_array.value(0); - assert_eq!(map_value.len(), 2); + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); - // Verify the struct values in the map - let struct_array = map_array - .values() - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(struct_array.len(), 2); + // Verify the struct + let struct_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!struct_array.is_null(0)); - // Check f0 field (names) - let names_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - let name1 = names_array.value(0); - let name2 = names_array.value(1); - assert!(name1 == "Alice" || name1 == "Bob"); - assert!(name2 == "Alice" || name2 == "Bob"); - assert_ne!(name1, name2); - - // Check f1 field (counts) - let counts_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(counts_array.value(0) == 10 || counts_array.value(0) == 20); - assert!(counts_array.value(1) == 10 || counts_array.value(1) == 20); -} + // Verify the list inside the struct (f0) + let list_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 3); + let int_array = list_value + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(int_array.value(0), 1); + assert_eq!(int_array.value(1), 2); + assert_eq!(int_array.value(2), 3); -#[test] -fn test_encode_list_of_structs_containing_maps() { - use arrow::array::{ListArray, MapArray}; - use vrl::value::ObjectMap; + // Verify the map inside the struct (f1) + let map_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!map_array.is_null(0)); + let map_value = map_array.value(0); + assert_eq!(map_value.len(), 2); + } - // Create a list of structs, where each struct contains a map - // List }> - // [ - // {"f0": 1, "f1": {"color": "red", "size": "large"}}, - // {"f0": 2, "f1": 
{"color": "blue", "size": "small"}} - // ] - let mut attrs1 = ObjectMap::new(); - attrs1.insert("color".into(), Value::Bytes("red".into())); - attrs1.insert("size".into(), Value::Bytes("large".into())); + #[test] + fn test_encode_map_with_struct_values() { + use arrow::array::MapArray; + use vrl::value::ObjectMap; + + // Create a map where values are structs + // Map + // {"item1": {"f0": "Alice", "f1": 10}, "item2": {"f0": "Bob", "f1": 20}} + let mut struct1 = ObjectMap::new(); + struct1.insert("f0".into(), Value::Bytes("Alice".into())); + struct1.insert("f1".into(), Value::Integer(10)); + + let mut struct2 = ObjectMap::new(); + struct2.insert("f0".into(), Value::Bytes("Bob".into())); + struct2.insert("f1".into(), Value::Integer(20)); + + let mut map_value = ObjectMap::new(); + map_value.insert("item1".into(), Value::Object(struct1)); + map_value.insert("item2".into(), Value::Object(struct2)); + + let mut log = LogEvent::default(); + log.insert("map_with_structs", Value::Object(map_value)); + + let events = vec![Event::Log(log)]; + + // Define schema: Map + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Utf8, true), + Field::new("f1", DataType::Int32, true), + ]); + + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Struct(struct_fields), true), + ])), + false, + ); - let mut struct1 = ObjectMap::new(); - struct1.insert("f0".into(), Value::Integer(1)); - struct1.insert("f1".into(), Value::Object(attrs1)); + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "map_with_structs", + DataType::Map(map_entries.into(), false), + true, + )])); - let mut attrs2 = ObjectMap::new(); - attrs2.insert("color".into(), Value::Bytes("blue".into())); - attrs2.insert("size".into(), Value::Bytes("small".into())); + let batch = + encode_and_decode(events, schema).expect("Failed to encode map with struct values"); - let mut struct2 = ObjectMap::new(); - struct2.insert("f0".into(), Value::Integer(2)); - struct2.insert("f1".into(), Value::Object(attrs2)); + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); - let list_value = Value::Array(vec![Value::Object(struct1), Value::Object(struct2)]); + // Verify the map + let map_array = batch.column(0).as_any().downcast_ref::().unwrap(); + assert!(!map_array.is_null(0)); + let map_value = map_array.value(0); + assert_eq!(map_value.len(), 2); - let mut log = LogEvent::default(); - log.insert("list_of_structs_with_maps", list_value); - - let events = vec![Event::Log(log)]; - - // Define schema - let map_entries = Field::new( - "entries", - DataType::Struct(arrow::datatypes::Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Utf8, true), - ])), - false, - ); + // Verify the struct values in the map + let struct_array = map_array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(struct_array.len(), 2); - let struct_fields = arrow::datatypes::Fields::from(vec![ - Field::new("f0", DataType::Int32, true), - Field::new("f1", DataType::Map(map_entries.into(), false), true), - ]); + // Check f0 field (names) + let names_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let name1 = names_array.value(0); + let name2 = names_array.value(1); + assert!(name1 == "Alice" || name1 == "Bob"); + assert!(name2 == "Alice" || name2 == "Bob"); + assert_ne!(name1, name2); + + // Check f1 field (counts) + 
let counts_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(counts_array.value(0) == 10 || counts_array.value(0) == 20); + assert!(counts_array.value(1) == 10 || counts_array.value(1) == 20); + } - let list_field = Field::new("item", DataType::Struct(struct_fields), true); + #[test] + fn test_encode_list_of_structs_containing_maps() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create a list of structs, where each struct contains a map + // List }> + // [ + // {"f0": 1, "f1": {"color": "red", "size": "large"}}, + // {"f0": 2, "f1": {"color": "blue", "size": "small"}} + // ] + let mut attrs1 = ObjectMap::new(); + attrs1.insert("color".into(), Value::Bytes("red".into())); + attrs1.insert("size".into(), Value::Bytes("large".into())); + + let mut struct1 = ObjectMap::new(); + struct1.insert("f0".into(), Value::Integer(1)); + struct1.insert("f1".into(), Value::Object(attrs1)); + + let mut attrs2 = ObjectMap::new(); + attrs2.insert("color".into(), Value::Bytes("blue".into())); + attrs2.insert("size".into(), Value::Bytes("small".into())); + + let mut struct2 = ObjectMap::new(); + struct2.insert("f0".into(), Value::Integer(2)); + struct2.insert("f1".into(), Value::Object(attrs2)); + + let list_value = Value::Array(vec![Value::Object(struct1), Value::Object(struct2)]); + + let mut log = LogEvent::default(); + log.insert("list_of_structs_with_maps", list_value); + + let events = vec![Event::Log(log)]; + + // Define schema + let map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + ); - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "list_of_structs_with_maps", - DataType::List(list_field.into()), - true, - )])); + let struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("f0", DataType::Int32, true), + Field::new("f1", DataType::Map(map_entries.into(), false), true), + ]); - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!( - result.is_ok(), - "Failed to encode list of structs with maps: {:?}", - result.err() - ); + let list_field = Field::new("item", DataType::Struct(struct_fields), true); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "list_of_structs_with_maps", + DataType::List(list_field.into()), + true, + )])); - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); + let batch = + encode_and_decode(events, schema).expect("Failed to encode list of structs with maps"); - // Verify the list - let list_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!list_array.is_null(0)); - let list_value = list_array.value(0); - assert_eq!(list_value.len(), 2); - - // Verify the structs in the list - let struct_array = list_value - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(struct_array.len(), 2); + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); - // Verify IDs (f0) - let id_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(id_array.value(0), 1); - assert_eq!(id_array.value(1), 2); - - // Verify maps (f1) - let map_array = struct_array - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - 
assert_eq!(map_array.len(), 2); - assert!(!map_array.is_null(0)); - assert!(!map_array.is_null(1)); - - // Verify first map has 2 entries - let first_map = map_array.value(0); - assert_eq!(first_map.len(), 2); - - // Verify second map has 2 entries - let second_map = map_array.value(1); - assert_eq!(second_map.len(), 2); -} + // Verify the list + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 2); -#[test] -fn test_encode_deeply_nested_mixed_types() { - use arrow::array::{ListArray, MapArray}; - use vrl::value::ObjectMap; + // Verify the structs in the list + let struct_array = list_value + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(struct_array.len(), 2); - // Create a very complex nested structure: - // Struct { - // data: List, metadata: Map }>> - // } - let mut metadata = ObjectMap::new(); - metadata.insert("key1".into(), Value::Bytes("value1".into())); + // Verify IDs (f0) + let id_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_array.value(0), 1); + assert_eq!(id_array.value(1), 2); - let mut inner_struct = ObjectMap::new(); - inner_struct.insert("f0".into(), Value::Array(vec![Value::Integer(100)])); - inner_struct.insert("f1".into(), Value::Object(metadata)); + // Verify maps (f1) + let map_array = struct_array + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(map_array.len(), 2); + assert!(!map_array.is_null(0)); + assert!(!map_array.is_null(1)); + + // Verify first map has 2 entries + let first_map = map_array.value(0); + assert_eq!(first_map.len(), 2); + + // Verify second map has 2 entries + let second_map = map_array.value(1); + assert_eq!(second_map.len(), 2); + } - let mut map_in_list = ObjectMap::new(); - map_in_list.insert("item_key".into(), Value::Object(inner_struct)); + #[test] + fn test_encode_deeply_nested_mixed_types() { + use arrow::array::{ListArray, MapArray}; + use vrl::value::ObjectMap; + + // Create a very complex nested structure: + // Struct { + // data: List, metadata: Map }>> + // } + let mut metadata = ObjectMap::new(); + metadata.insert("key1".into(), Value::Bytes("value1".into())); + + let mut inner_struct = ObjectMap::new(); + inner_struct.insert("f0".into(), Value::Array(vec![Value::Integer(100)])); + inner_struct.insert("f1".into(), Value::Object(metadata)); + + let mut map_in_list = ObjectMap::new(); + map_in_list.insert("item_key".into(), Value::Object(inner_struct)); + + let mut outer_struct = ObjectMap::new(); + outer_struct.insert("f0".into(), Value::Array(vec![Value::Object(map_in_list)])); + + let mut log = LogEvent::default(); + log.insert("deeply_nested", Value::Object(outer_struct)); + + let events = vec![Event::Log(log)]; + + // Define schema + let metadata_map_entries = Field::new( + "entries", + DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + ); - let mut outer_struct = ObjectMap::new(); - outer_struct.insert("f0".into(), Value::Array(vec![Value::Object(map_in_list)])); + let inner_struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new( + "f0", + DataType::List(Field::new("item", DataType::Int32, true).into()), + true, + ), + Field::new( + "f1", + DataType::Map(metadata_map_entries.into(), false), + true, + ), + ]); + + let map_entries = Field::new( + "entries", + 
DataType::Struct(arrow::datatypes::Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Struct(inner_struct_fields), true), + ])), + false, + ); - let mut log = LogEvent::default(); - log.insert("deeply_nested", Value::Object(outer_struct)); - - let events = vec![Event::Log(log)]; - - // Define schema - let metadata_map_entries = Field::new( - "entries", - DataType::Struct(arrow::datatypes::Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Utf8, true), - ])), - false, - ); + let list_field = Field::new("item", DataType::Map(map_entries.into(), false), true); - let inner_struct_fields = arrow::datatypes::Fields::from(vec![ - Field::new( + let outer_struct_fields = arrow::datatypes::Fields::from(vec![Field::new( "f0", - DataType::List(Field::new("item", DataType::Int32, true).into()), + DataType::List(list_field.into()), true, - ), - Field::new( - "f1", - DataType::Map(metadata_map_entries.into(), false), + )]); + + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "deeply_nested", + DataType::Struct(outer_struct_fields), true, - ), - ]); - - let map_entries = Field::new( - "entries", - DataType::Struct(arrow::datatypes::Fields::from(vec![ - Field::new("keys", DataType::Utf8, false), - Field::new("values", DataType::Struct(inner_struct_fields), true), - ])), - false, - ); + )])); - let list_field = Field::new("item", DataType::Map(map_entries.into(), false), true); - - let outer_struct_fields = arrow::datatypes::Fields::from(vec![Field::new( - "f0", - DataType::List(list_field.into()), - true, - )]); - - let schema = SchemaRef::new(Schema::new(vec![Field::new( - "deeply_nested", - DataType::Struct(outer_struct_fields), - true, - )])); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::clone(&schema))); - assert!( - result.is_ok(), - "Failed to encode deeply nested mixed types: {:?}", - result.err() - ); + let batch = + encode_and_decode(events, schema).expect("Failed to encode deeply nested mixed types"); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 1); - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); + // Verify the outer struct + let outer_struct = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!outer_struct.is_null(0)); - // Verify the outer struct - let outer_struct = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!outer_struct.is_null(0)); + // Verify the list inside the outer struct + let list_array = outer_struct + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + let list_value = list_array.value(0); + assert_eq!(list_value.len(), 1); - // Verify the list inside the outer struct - let list_array = outer_struct - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!list_array.is_null(0)); - let list_value = list_array.value(0); - assert_eq!(list_value.len(), 1); - - // Verify the map inside the list - let map_array = list_value.as_any().downcast_ref::().unwrap(); - assert_eq!(map_array.len(), 1); - assert!(!map_array.is_null(0)); - - // Verify the struct inside the map - let struct_values = map_array - .values() - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(struct_values.len(), 1); + // Verify the map inside the list + let 
map_array = list_value.as_any().downcast_ref::().unwrap(); + assert_eq!(map_array.len(), 1); + assert!(!map_array.is_null(0)); - // Verify the list inside the struct - let inner_list = struct_values - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!inner_list.is_null(0)); - let inner_list_value = inner_list.value(0); - assert_eq!(inner_list_value.len(), 1); - - // Verify the innermost map - let inner_map = struct_values - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!inner_map.is_null(0)); - let inner_map_value = inner_map.value(0); - assert_eq!(inner_map_value.len(), 1); -} + // Verify the struct inside the map + let struct_values = map_array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(struct_values.len(), 1); -#[test] -fn test_automatic_json_serialization_for_array_of_objects() { - use vrl::value::ObjectMap; + // Verify the list inside the struct + let inner_list = struct_values + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!inner_list.is_null(0)); + let inner_list_value = inner_list.value(0); + assert_eq!(inner_list_value.len(), 1); - // Create array of objects (like the user's components data) - let mut obj1 = ObjectMap::new(); - obj1.insert("name".into(), Value::Bytes("tick.mexc.spot".into())); - obj1.insert("alias".into(), Value::Bytes("guiusdt".into())); - obj1.insert("expireAfter".into(), Value::Integer(60000)); + // Verify the innermost map + let inner_map = struct_values + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!inner_map.is_null(0)); + let inner_map_value = inner_map.value(0); + assert_eq!(inner_map_value.len(), 1); + } - let mut obj2 = ObjectMap::new(); - obj2.insert("name".into(), Value::Bytes("tick.binance".into())); - obj2.insert("alias".into(), Value::Bytes("btcusdt".into())); - obj2.insert("expireAfter".into(), Value::Integer(30000)); + #[test] + fn test_automatic_json_serialization_for_array_of_objects() { + use vrl::value::ObjectMap; - let components = Value::Array(vec![Value::Object(obj1), Value::Object(obj2)]); + // Create array of objects (like the user's components data) + let mut obj1 = ObjectMap::new(); + obj1.insert("name".into(), Value::Bytes("service.api.v1".into())); + obj1.insert("alias".into(), Value::Bytes("widget-alpha".into())); + obj1.insert("timeout".into(), Value::Integer(60000)); - let mut log = LogEvent::default(); - log.insert("components", components); - - let events = vec![Event::Log(log)]; - - // Schema expects Array(String), but we're providing Array(Object) - // The encoder should automatically serialize objects to JSON strings - let schema = Schema::new(vec![Field::new( - "components", - DataType::List(Field::new("item", DataType::Utf8, true).into()), - false, - )]); - - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::new(schema))); - assert!( - result.is_ok(), - "Encoding should succeed with automatic JSON serialization" - ); + let mut obj2 = ObjectMap::new(); + obj2.insert("name".into(), Value::Bytes("service.backend".into())); + obj2.insert("alias".into(), Value::Bytes("widget-beta".into())); + obj2.insert("timeout".into(), Value::Integer(30000)); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let components = Value::Array(vec![Value::Object(obj1), Value::Object(obj2)]); - assert_eq!(batch.num_rows(), 1); + let mut log = LogEvent::default(); + log.insert("components", components); 
- let list_array = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!list_array.is_null(0)); - - let list_value = list_array.value(0); - let string_array = list_value.as_any().downcast_ref::().unwrap(); - - // Should have 2 strings (JSON serialized objects) - assert_eq!(string_array.len(), 2); - - // Verify the first object was serialized to JSON - let json1 = string_array.value(0); - assert!(json1.contains("\"name\":\"tick.mexc.spot\"")); - assert!(json1.contains("\"alias\":\"guiusdt\"")); - assert!(json1.contains("\"expireAfter\":60000")); - - // Verify the second object was serialized to JSON - let json2 = string_array.value(1); - assert!(json2.contains("\"name\":\"tick.binance\"")); - assert!(json2.contains("\"alias\":\"btcusdt\"")); - assert!(json2.contains("\"expireAfter\":30000")); -} + let events = vec![Event::Log(log)]; -#[test] -fn test_object_in_map_values_to_string() { - use vrl::value::ObjectMap; + // Schema expects Array(String), but we're providing Array(Object) + // The encoder should automatically serialize objects to JSON strings + let schema = Schema::new(vec![Field::new( + "components", + DataType::List(Field::new("item", DataType::Utf8, true).into()), + false, + )]); - // Create a map with object values: Map - // Schema expects Map, so objects should serialize to JSON - let mut inner_obj = ObjectMap::new(); - inner_obj.insert("config".into(), Value::Bytes("enabled".into())); - inner_obj.insert("timeout".into(), Value::Integer(5000)); + let batch = encode_and_decode(events, Arc::new(schema)) + .expect("Encoding should succeed with automatic JSON serialization"); - let mut map_value = ObjectMap::new(); - map_value.insert("setting1".into(), Value::Object(inner_obj)); - map_value.insert("setting2".into(), Value::Bytes("simple string".into())); + assert_eq!(batch.num_rows(), 1); - let mut log = LogEvent::default(); - log.insert("settings", Value::Object(map_value)); + let list_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!list_array.is_null(0)); + + let list_value = list_array.value(0); + let string_array = list_value.as_any().downcast_ref::().unwrap(); + + // Should have 2 strings (JSON serialized objects) + assert_eq!(string_array.len(), 2); + + // Verify the first object was serialized to JSON + let json1 = string_array.value(0); + assert!(json1.contains("\"name\":\"service.api.v1\"")); + assert!(json1.contains("\"alias\":\"widget-alpha\"")); + assert!(json1.contains("\"timeout\":60000")); + + // Verify the second object was serialized to JSON + let json2 = string_array.value(1); + assert!(json2.contains("\"name\":\"service.backend\"")); + assert!(json2.contains("\"alias\":\"widget-beta\"")); + assert!(json2.contains("\"timeout\":30000")); + } - let events = vec![Event::Log(log)]; + #[test] + fn test_object_in_map_values_to_string() { + use vrl::value::ObjectMap; - // Schema: Map (expects string values, but we have objects) - let key_field = Field::new("keys", DataType::Utf8, false); - let value_field = Field::new("values", DataType::Utf8, true); - let entries_struct = DataType::Struct(Fields::from(vec![key_field, value_field])); - let entries_field = Field::new("entries", entries_struct, false); - let map_type = DataType::Map(entries_field.into(), false); + // Create a map with object values: Map + // Schema expects Map, so objects should serialize to JSON + let mut inner_obj = ObjectMap::new(); + inner_obj.insert("config".into(), Value::Bytes("enabled".into())); + inner_obj.insert("timeout".into(), 
Value::Integer(5000)); - let schema = Schema::new(vec![Field::new("settings", map_type, false)]); + let mut map_value = ObjectMap::new(); + map_value.insert("setting1".into(), Value::Object(inner_obj)); + map_value.insert("setting2".into(), Value::Bytes("simple string".into())); - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::new(schema))); - assert!( - result.is_ok(), - "Map with object values should serialize to JSON strings" - ); + let mut log = LogEvent::default(); + log.insert("settings", Value::Object(map_value)); - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + let events = vec![Event::Log(log)]; - assert_eq!(batch.num_rows(), 1); + // Schema: Map (expects string values, but we have objects) + let key_field = Field::new("keys", DataType::Utf8, false); + let value_field = Field::new("values", DataType::Utf8, true); + let entries_struct = DataType::Struct(Fields::from(vec![key_field, value_field])); + let entries_field = Field::new("entries", entries_struct, false); + let map_type = DataType::Map(entries_field.into(), false); - let map_array = batch.column(0).as_any().downcast_ref::().unwrap(); - assert!(!map_array.is_null(0)); + let schema = Schema::new(vec![Field::new("settings", map_type, false)]); - // Get the values from the map - let values_array = map_array - .values() - .as_any() - .downcast_ref::() - .unwrap(); - - // One value should be a JSON object, one should be a plain string - let mut found_json_object = false; - let mut found_plain_string = false; - - for i in 0..values_array.len() { - let value = values_array.value(i); - if value.contains("\"config\"") && value.contains("\"timeout\"") { - found_json_object = true; - } else if value == "simple string" { - found_plain_string = true; - } - } + let batch = encode_and_decode(events, Arc::new(schema)) + .expect("Map with object values should serialize to JSON strings"); - assert!( - found_json_object, - "Should find JSON-serialized object in map values" - ); - assert!(found_plain_string, "Should find plain string in map values"); -} + assert_eq!(batch.num_rows(), 1); -#[test] -fn test_nested_arrays_with_objects() { - use vrl::value::ObjectMap; + let map_array = batch.column(0).as_any().downcast_ref::().unwrap(); + assert!(!map_array.is_null(0)); - // Array of arrays, where inner arrays contain objects - let mut obj = ObjectMap::new(); - obj.insert("id".into(), Value::Integer(123)); + // Get the values from the map + let values_array = map_array + .values() + .as_any() + .downcast_ref::() + .unwrap(); + + // One value should be a JSON object, one should be a plain string + let mut found_json_object = false; + let mut found_plain_string = false; + + for i in 0..values_array.len() { + let value = values_array.value(i); + if value.contains("\"config\"") && value.contains("\"timeout\"") { + found_json_object = true; + } else if value == "simple string" { + found_plain_string = true; + } + } - let inner_array = Value::Array(vec![Value::Object(obj.clone())]); - let outer_array = Value::Array(vec![inner_array]); + assert!( + found_json_object, + "Should find JSON-serialized object in map values" + ); + assert!(found_plain_string, "Should find plain string in map values"); + } - let mut log = LogEvent::default(); - log.insert("nested", outer_array); + #[test] + fn test_nested_arrays_with_objects() { + use vrl::value::ObjectMap; - let events = vec![Event::Log(log)]; + // Array of 
arrays, where inner arrays contain objects + let mut obj = ObjectMap::new(); + obj.insert("id".into(), Value::Integer(123)); - // Schema: Array(Array(String)) - let inner_field = Field::new("item", DataType::Utf8, true); - let middle_field = Field::new("item", DataType::List(inner_field.into()), true); - let outer_list = DataType::List(middle_field.into()); + let inner_array = Value::Array(vec![Value::Object(obj.clone())]); + let outer_array = Value::Array(vec![inner_array]); - let schema = Schema::new(vec![Field::new("nested", outer_list, false)]); + let mut log = LogEvent::default(); + log.insert("nested", outer_array); - let result = encode_events_to_arrow_ipc_stream(&events, Some(Arc::new(schema))); - assert!( - result.is_ok(), - "Nested arrays with objects should serialize" - ); + let events = vec![Event::Log(log)]; - let bytes = result.unwrap(); - let cursor = Cursor::new(bytes); - let mut reader = StreamReader::try_new(cursor, None).unwrap(); - let batch = reader.next().unwrap().unwrap(); + // Schema: Array(Array(String)) + let inner_field = Field::new("item", DataType::Utf8, true); + let middle_field = Field::new("item", DataType::List(inner_field.into()), true); + let outer_list = DataType::List(middle_field.into()); - assert_eq!(batch.num_rows(), 1); + let schema = Schema::new(vec![Field::new("nested", outer_list, false)]); - // Navigate to the deepest array - let outer_list = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - let outer_value = outer_list.value(0); - let middle_list = outer_value.as_any().downcast_ref::().unwrap(); - let middle_value = middle_list.value(0); - let inner_strings = middle_value.as_any().downcast_ref::().unwrap(); - - // Should have one JSON string - assert_eq!(inner_strings.len(), 1); - let json_str = inner_strings.value(0); - assert!( - json_str.contains("\"id\":123"), - "Deeply nested object should be serialized to JSON" - ); + let batch = encode_and_decode(events, Arc::new(schema)) + .expect("Nested arrays with objects should serialize"); + + assert_eq!(batch.num_rows(), 1); + + // Navigate to the deepest array + let outer_list = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let outer_value = outer_list.value(0); + let middle_list = outer_value.as_any().downcast_ref::().unwrap(); + let middle_value = middle_list.value(0); + let inner_strings = middle_value.as_any().downcast_ref::().unwrap(); + + // Should have one JSON string + assert_eq!(inner_strings.len(), 1); + let json_str = inner_strings.value(0); + assert!( + json_str.contains("\"id\":123"), + "Deeply nested object should be serialized to JSON" + ); + } } From d8c3dc194b5e1125bde08063acd826fe670f68e3 Mon Sep 17 00:00:00 2001 From: benjamin-awd Date: Wed, 24 Dec 2025 15:05:45 +0800 Subject: [PATCH 08/11] chore: update docs --- website/cue/reference/components/sinks/clickhouse.cue | 3 --- 1 file changed, 3 deletions(-) diff --git a/website/cue/reference/components/sinks/clickhouse.cue b/website/cue/reference/components/sinks/clickhouse.cue index 1049cf5217976..c0d80d1444388 100644 --- a/website/cue/reference/components/sinks/clickhouse.cue +++ b/website/cue/reference/components/sinks/clickhouse.cue @@ -142,9 +142,6 @@ components: sinks: clickhouse: { The following ClickHouse column types are **not yet supported** by Vector's ArrowStream implementation: - - `Array` - - `Tuple` - - `Map` - `IPv4` - `IPv6` From b07c11eb4e51d1a17b090a70e057e44edd5d8762 Mon Sep 17 00:00:00 2001 From: benjamin-awd Date: Wed, 24 Dec 2025 15:08:04 +0800 Subject: [PATCH 09/11] 
chore: add changelog fragment --- .../24074_clickhouse_arrow_complex_types.enhancement.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changelog.d/24074_clickhouse_arrow_complex_types.enhancement.md diff --git a/changelog.d/24074_clickhouse_arrow_complex_types.enhancement.md b/changelog.d/24074_clickhouse_arrow_complex_types.enhancement.md new file mode 100644 index 0000000000000..5a5d6c65ec212 --- /dev/null +++ b/changelog.d/24074_clickhouse_arrow_complex_types.enhancement.md @@ -0,0 +1,3 @@ +The `clickhouse` sink now supports complex data types (Array, Map, and Tuple) when using the `arrow_stream` format. + +authors: benjamin-awd From 2266d02cbe6f71b830994d482c59aad499cb3c61 Mon Sep 17 00:00:00 2001 From: benjamin-awd Date: Wed, 24 Dec 2025 15:40:14 +0800 Subject: [PATCH 10/11] chore: add support for named tuples --- .../src/encoding/format/arrow/builder.rs | 4 +- lib/codecs/src/encoding/format/arrow/tests.rs | 34 +++- src/sinks/clickhouse/arrow/parser.rs | 147 +++++++++++++++--- src/sinks/clickhouse/integration_tests.rs | 48 +++++- 4 files changed, 203 insertions(+), 30 deletions(-) diff --git a/lib/codecs/src/encoding/format/arrow/builder.rs b/lib/codecs/src/encoding/format/arrow/builder.rs index 262cb89f437ee..240edb9ec7a1c 100644 --- a/lib/codecs/src/encoding/format/arrow/builder.rs +++ b/lib/codecs/src/encoding/format/arrow/builder.rs @@ -314,7 +314,9 @@ fn append_value_to_builder( let struct_builder = downcast_builder!(builder, StructBuilder, field)?; for (i, field) in fields.iter().enumerate() { - let key = format!("f{}", i); + // Use the actual field name from the schema + // This supports both named tuples and unnamed tuples (which use "f0", "f1", etc.) + let key = field.name(); let field_builder = &mut struct_builder.field_builders_mut()[i]; match obj.get(key.as_str()) { Some(val) => append_value_to_builder(field_builder.as_mut(), val, field)?, diff --git a/lib/codecs/src/encoding/format/arrow/tests.rs b/lib/codecs/src/encoding/format/arrow/tests.rs index d3277219cc0c1..345da25c0b326 100644 --- a/lib/codecs/src/encoding/format/arrow/tests.rs +++ b/lib/codecs/src/encoding/format/arrow/tests.rs @@ -99,11 +99,16 @@ mod comprehensive { let now = Utc::now(); - // Create a struct (tuple) value + // Create a struct (tuple) value with unnamed fields let mut tuple_value = ObjectMap::new(); tuple_value.insert("f0".into(), Value::Bytes("nested_str".into())); tuple_value.insert("f1".into(), Value::Integer(999)); + // Create a named struct (named tuple) value + let mut named_tuple_value = ObjectMap::new(); + named_tuple_value.insert("category".into(), Value::Bytes("test_category".into())); + named_tuple_value.insert("tag".into(), Value::Bytes("test_tag".into())); + // Create a list value let list_value = Value::Array(vec![ Value::Integer(1), @@ -136,6 +141,7 @@ mod comprehensive { // Complex types log.insert("list_field", list_value); log.insert("struct_field", Value::Object(tuple_value)); + log.insert("named_struct_field", Value::Object(named_tuple_value)); log.insert("map_field", Value::Object(map_value)); let events = vec![Event::Log(log)]; @@ -146,6 +152,11 @@ mod comprehensive { Field::new("f1", DataType::Int64, true), ]); + let named_struct_fields = arrow::datatypes::Fields::from(vec![ + Field::new("category", DataType::Utf8, true), + Field::new("tag", DataType::Utf8, true), + ]); + let map_entries = Field::new( "entries", DataType::Struct(arrow::datatypes::Fields::from(vec![ @@ -181,13 +192,18 @@ mod comprehensive { true, ), Field::new("struct_field", 
DataType::Struct(struct_fields), true),
+            Field::new(
+                "named_struct_field",
+                DataType::Struct(named_struct_fields),
+                true,
+            ),
             Field::new("map_field", DataType::Map(map_entries.into(), false), true),
         ]));
 
         let batch = encode_and_decode(events, schema).expect("Failed to encode");
 
         assert_eq!(batch.num_rows(), 1);
-        assert_eq!(batch.num_columns(), 18);
+        assert_eq!(batch.num_columns(), 19);
 
         // Verify all primitive types
         assert_eq!(
@@ -260,7 +276,7 @@
         assert_eq!(int_array.value(1), 2);
         assert_eq!(int_array.value(2), 3);
 
-        // Verify struct field
+        // Verify struct field (unnamed)
         let struct_array = batch
             .column(16)
             .as_any()
             .downcast_ref::<StructArray>()
             .unwrap();
         assert!(!struct_array.is_null(0));
         assert_primitive_value!(struct_array, 0, 0, StringArray, "nested_str");
         assert_primitive_value!(struct_array, 1, 0, Int64Array, 999);
 
+        // Verify named struct field (named tuple)
+        let named_struct_array = batch
+            .column(17)
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .unwrap();
+        assert!(!named_struct_array.is_null(0));
+        assert_primitive_value!(named_struct_array, 0, 0, StringArray, "test_category");
+        assert_primitive_value!(named_struct_array, 1, 0, StringArray, "test_tag");
+
         // Verify map field
         let map_array = batch
-            .column(17)
+            .column(18)
             .as_any()
             .downcast_ref::<MapArray>()
             .unwrap();
diff --git a/src/sinks/clickhouse/arrow/parser.rs b/src/sinks/clickhouse/arrow/parser.rs
index 506a228e434b3..abf9a774374aa 100644
--- a/src/sinks/clickhouse/arrow/parser.rs
+++ b/src/sinks/clickhouse/arrow/parser.rs
@@ -19,8 +19,8 @@ pub enum ClickHouseType<'a> {
     LowCardinality(Box<ClickHouseType<'a>>),
     /// Array(T)
     Array(Box<ClickHouseType<'a>>),
-    /// Tuple(T1, T2, ...)
-    Tuple(Vec<ClickHouseType<'a>>),
+    /// Tuple(T1, T2, ...) or Tuple(name1 T1, name2 T2, ...)
+    Tuple(Vec<(Option<&'a str>, ClickHouseType<'a>)>),
    /// Map(K, V)
     Map(Box<ClickHouseType<'a>>, Box<ClickHouseType<'a>>),
 }
@@ -98,9 +98,19 @@ impl<'a> ClickHouseType<'a> {
             let fields: Result<Vec<Field>, String> = elements
                 .iter()
                 .enumerate()
-                .map(|(i, elem)| {
+                .map(|(i, (field_name, elem))| {
                     let (elem_arrow, elem_nullable) = elem.to_arrow()?;
-                    Ok(Field::new(format!("f{}", i), elem_arrow, elem_nullable))
+                    let name = field_name.unwrap_or_else(|| {
+                        // Use a static string slice that lives long enough
+                        // For unnamed fields, we'll use format! below
+                        ""
+                    });
+                    let field_name = if name.is_empty() {
+                        format!("f{}", i)
+                    } else {
+                        name.to_string()
+                    };
+                    Ok(Field::new(field_name, elem_arrow, elem_nullable))
                 })
                 .collect();
             Ok((DataType::Struct(Fields::from(fields?)), is_nullable))
@@ -149,7 +159,7 @@ pub fn parse_ch_type(ty: &str) -> ClickHouseType<'_> {
         "Tuple" => {
             let elements = parse_args(args_str)
                 .into_iter()
-                .map(|arg| parse_ch_type(arg))
+                .map(|arg| parse_tuple_element(arg))
                 .collect();
             return ClickHouseType::Tuple(elements);
         }
@@ -170,6 +180,42 @@ pub fn parse_ch_type(ty: &str) -> ClickHouseType<'_> {
     ClickHouseType::Primitive(ty)
 }
 
+/// Helper: Finds the index of a delimiter, respecting nested parentheses/quotes.
+fn find_delimiter(input: &str, delimiter: char) -> Option<usize> {
+    let mut depth = 0;
+    let mut in_quotes = false;
+
+    for (i, c) in input.char_indices() {
+        match c {
+            '\'' => in_quotes = !in_quotes,
+            '(' if !in_quotes => depth += 1,
+            ')' if !in_quotes => depth -= 1,
+            c if c == delimiter && depth == 0 && !in_quotes => return Some(i),
+            _ => {}
+        }
+    }
+    None
+}
+
+/// Parses a Tuple element which can be:
+/// - Just a type: "String" -> (None, ClickHouseType::Primitive("String"))
+/// - Named field: "category String" -> (Some("category"), ClickHouseType::Primitive("String"))
+fn parse_tuple_element(element: &str) -> (Option<&str>, ClickHouseType<'_>) {
+    let element = element.trim();
+
+    // Use the helper to find the first space
+    if let Some(pos) = find_delimiter(element, ' ') {
+        let name = element[..pos].trim();
+        let type_str = element[pos + 1..].trim();
+        if !name.is_empty() && !type_str.is_empty() {
+            return (Some(name), parse_ch_type(type_str));
+        }
+    }
+
+    // No named field found, treat entire element as a type
+    (None, parse_ch_type(element))
+}
+
 /// Tries to parse "TypeName(args)" into ("TypeName", "args").
 fn try_parse_wrapper(ty: &str) -> Option<(&str, &str)> {
     let paren_pos = ty.find('(')?;
@@ -183,16 +229,13 @@ fn try_parse_wrapper(ty: &str) -> Option<(&str, &str)> {
     Some((type_name, args))
 }
 
-/// Parses comma-separated arguments, respecting nesting and quotes.
-/// Handles input with or without surrounding parentheses.
-/// Examples: "Int32, String" or "(Int32, String)" both work.
 /// Parses comma-separated arguments, respecting nesting and quotes.
 /// Handles input with or without surrounding parentheses.
 /// Examples: "Int32, String" or "(Int32, String)" both work.
 fn parse_args(input: &str) -> Vec<&str> {
     let input = input.trim();
 
-    // Strip parentheses if present
+    // Strip outer parens
     let input = if input.starts_with('(') && input.ends_with(')') {
         &input[1..input.len() - 1]
     } else {
@@ -204,24 +247,16 @@ fn parse_args(input: &str) -> Vec<&str> {
     }
 
     let mut args = Vec::new();
-    let mut start = 0;
-    let mut depth = 0;
-    let mut in_quotes = false;
+    let mut current = input;
 
-    for (i, c) in input.char_indices() {
-        match c {
-            '\'' => in_quotes = !in_quotes,
-            '(' if !in_quotes => depth += 1,
-            ')' if !in_quotes => depth -= 1,
-            ',' if depth == 0 && !in_quotes => {
-                args.push(input[start..i].trim());
-                start = i + 1;
-            }
-            _ => {}
-        }
+    // Use the same helper to loop through commas
+    while let Some(pos) = find_delimiter(current, ',') {
+        args.push(current[..pos].trim());
+        current = &current[pos + 1..];
     }
+    // Push the remainder
+    args.push(current.trim());
 
-    args.push(input[start..].trim());
     args
 }
@@ -807,4 +842,68 @@ mod tests {
             assert!(matches!(value_field.data_type(), DataType::Struct(_)));
         }
     }
+
+    #[test]
+    fn test_named_tuple_fields() {
+        // Simple named tuple
+        let result = convert_type_no_metadata("Tuple(category String, tag String)");
+        assert!(result.is_ok());
+        let (dtype, _) = result.unwrap();
+        if let DataType::Struct(fields) = dtype {
+            assert_eq!(fields.len(), 2);
+            assert_eq!(fields[0].name(), "category");
+            assert_eq!(fields[1].name(), "tag");
+            assert!(matches!(fields[0].data_type(), DataType::Utf8));
+            assert!(matches!(fields[1].data_type(), DataType::Utf8));
+        } else {
+            panic!("Expected Struct type");
+        }
+
+        // Array of named tuples (the original failing case)
+        let result = convert_type_no_metadata("Array(Tuple(category String, tag String))");
+        assert!(result.is_ok());
+        let (dtype, _) = result.unwrap();
+        if let DataType::List(inner) = dtype {
+            if let
DataType::Struct(fields) = inner.data_type() {
+                assert_eq!(fields.len(), 2);
+                assert_eq!(fields[0].name(), "category");
+                assert_eq!(fields[1].name(), "tag");
+                assert!(matches!(fields[0].data_type(), DataType::Utf8));
+                assert!(matches!(fields[1].data_type(), DataType::Utf8));
+            } else {
+                panic!("Expected Struct type inside List");
+            }
+        } else {
+            panic!("Expected List type");
+        }
+
+        // Mixed named and unnamed (named fields take precedence)
+        let result = convert_type_no_metadata("Tuple(id Int64, data String)");
+        assert!(result.is_ok());
+        let (dtype, _) = result.unwrap();
+        if let DataType::Struct(fields) = dtype {
+            assert_eq!(fields.len(), 2);
+            assert_eq!(fields[0].name(), "id");
+            assert_eq!(fields[1].name(), "data");
+            assert!(matches!(fields[0].data_type(), DataType::Int64));
+            assert!(matches!(fields[1].data_type(), DataType::Utf8));
+        } else {
+            panic!("Expected Struct type");
+        }
+
+        // Named tuple with complex types
+        let result =
+            convert_type_no_metadata("Tuple(items Array(Int32), metadata Map(String, String))");
+        assert!(result.is_ok());
+        let (dtype, _) = result.unwrap();
+        if let DataType::Struct(fields) = dtype {
+            assert_eq!(fields.len(), 2);
+            assert_eq!(fields[0].name(), "items");
+            assert_eq!(fields[1].name(), "metadata");
+            assert!(matches!(fields[0].data_type(), DataType::List(_)));
+            assert!(matches!(fields[1].data_type(), DataType::Map(_, _)));
+        } else {
+            panic!("Expected Struct type");
+        }
+    }
 }
diff --git a/src/sinks/clickhouse/integration_tests.rs b/src/sinks/clickhouse/integration_tests.rs
index bdcde12569dad..a63e4a9e91e79 100644
--- a/src/sinks/clickhouse/integration_tests.rs
+++ b/src/sinks/clickhouse/integration_tests.rs
@@ -657,7 +657,8 @@ async fn test_complex_types() {
         response_metrics Tuple(Int32, Int64, Float64), \
         tags Array(String), \
         user_properties Map(String, Array(String)), \
-        array_with_nulls Array(Nullable(Int32))",
+        array_with_nulls Array(Nullable(Int32)), \
+        array_with_named_tuple Array(Tuple(category String, tag String))",
     )
     .await;
 
@@ -850,6 +851,32 @@ async fn test_complex_types() {
         ]),
     );
 
+    // Named tuple array - tests that named fields work correctly
+    let mut named_tuple1 = vector_lib::event::ObjectMap::new();
+    named_tuple1.insert(
+        "category".into(),
+        vector_lib::event::Value::Bytes("priority".into()),
+    );
+    named_tuple1.insert("tag".into(), vector_lib::event::Value::Bytes("high".into()));
+
+    let mut named_tuple2 = vector_lib::event::ObjectMap::new();
+    named_tuple2.insert(
+        "category".into(),
+        vector_lib::event::Value::Bytes("environment".into()),
+    );
+    named_tuple2.insert(
+        "tag".into(),
+        vector_lib::event::Value::Bytes("production".into()),
+    );
+
+    event1.insert(
+        "array_with_named_tuple",
+        vector_lib::event::Value::Array(vec![
+            vector_lib::event::Value::Object(named_tuple1),
+            vector_lib::event::Value::Object(named_tuple2),
+        ]),
+    );
+
     events.push(event1.into());
 
     // Event 2: Empty and edge cases
@@ -928,6 +955,10 @@ async fn test_complex_types() {
         vector_lib::event::Value::Object(empty_map),
     );
     event2.insert("array_with_nulls", vector_lib::event::Value::Array(vec![]));
+    event2.insert(
+        "array_with_named_tuple",
+        vector_lib::event::Value::Array(vec![]),
+    );
 
     events.push(event2.into());
 
@@ -1072,6 +1103,21 @@ async fn test_complex_types() {
         vector_lib::event::Value::Array(vec![vector_lib::event::Value::Integer(42)]),
     );
 
+    // Named tuple with single element
+    let mut named_tuple3 = vector_lib::event::ObjectMap::new();
+    named_tuple3.insert(
+        "category".into(),
From 905a059b7681b588b9625fdf04755eb42b1dc597 Mon Sep 17 00:00:00 2001
From: benjamin-awd
Date: Wed, 24 Dec 2025 16:42:26 +0800
Subject: [PATCH 11/11] chore: improve handling for to_arrow

---
 src/sinks/clickhouse/arrow/parser.rs | 77 ++++++++++++----------------
 1 file changed, 32 insertions(+), 45 deletions(-)

diff --git a/src/sinks/clickhouse/arrow/parser.rs b/src/sinks/clickhouse/arrow/parser.rs
index abf9a774374aa..3b79994ea4968 100644
--- a/src/sinks/clickhouse/arrow/parser.rs
+++ b/src/sinks/clickhouse/arrow/parser.rs
@@ -1,7 +1,6 @@
 //! ClickHouse type parsing and conversion to Arrow types.
 
 use arrow::datatypes::{DataType, Field, Fields, TimeUnit};
-use std::sync::Arc;
 
 const DECIMAL32_PRECISION: u8 = 9;
 const DECIMAL64_PRECISION: u8 = 18;
@@ -51,10 +50,10 @@ impl<'a> ClickHouseType<'a> {
     pub fn to_arrow(&self) -> Result<(DataType, bool), String> {
         let is_nullable = self.is_nullable();
 
-        match self.base_type() {
+        let data_type = match self.base_type() {
             ClickHouseType::Primitive(name) => {
                 let (type_name, _) = extract_identifier(name);
-                let data_type = match type_name {
+                match type_name {
                     // Numeric
                     "Int8" => DataType::Int8,
                     "Int16" => DataType::Int16,
@@ -74,69 +73,57 @@ impl<'a> ClickHouseType<'a> {
                     // Strings
                     "String" | "FixedString" => DataType::Utf8,
 
-                    // Date and time types (timezones not currently handled, defaults to UTC)
+                    // Date and time
                     "Date" | "Date32" => DataType::Date32,
                     "DateTime" => DataType::Timestamp(TimeUnit::Second, None),
                     "DateTime64" => parse_datetime64_precision(name)?,
 
-                    // Unknown
-                    _ => {
-                        return Err(format!(
-                            "Unknown ClickHouse type '{}'. This type cannot be automatically converted.",
-                            type_name
-                        ));
-                    }
-                };
-                Ok((data_type, is_nullable))
+                    _ => return Err(format!("Unknown ClickHouse type '{}'", type_name)),
+                }
             }
             ClickHouseType::Array(inner) => {
                 let (inner_arrow, inner_nullable) = inner.to_arrow()?;
-                let field = Field::new("item", inner_arrow, inner_nullable);
-                Ok((DataType::List(Arc::new(field)), is_nullable))
+                DataType::List(Field::new("item", inner_arrow, inner_nullable).into())
             }
             ClickHouseType::Tuple(elements) => {
-                let fields: Result<Vec<Field>, String> = elements
+                let fields: Vec<Field> = elements
                     .iter()
                     .enumerate()
-                    .map(|(i, (field_name, elem))| {
-                        let (elem_arrow, elem_nullable) = elem.to_arrow()?;
-                        let name = field_name.unwrap_or_else(|| {
-                            // Use a static string slice that lives long enough
-                            // For unnamed fields, we'll use format! below
-                            ""
-                        });
-                        let field_name = if name.is_empty() {
-                            format!("f{}", i)
-                        } else {
-                            name.to_string()
-                        };
-                        Ok(Field::new(field_name, elem_arrow, elem_nullable))
+                    .map(|(i, (name_opt, elem))| {
+                        let (dt, nullable) = elem.to_arrow()?;
+
+                        let name = name_opt
+                            .as_deref()
+                            .filter(|s| !s.is_empty())
+                            .map(|s| s.to_string())
+                            .unwrap_or_else(|| format!("f{}", i));
+
+                        Ok(Field::new(name, dt, nullable))
                     })
-                    .collect();
-                Ok((DataType::Struct(Fields::from(fields?)), is_nullable))
+                    .collect::<Result<_, String>>()?;
+
+                DataType::Struct(Fields::from(fields))
             }
             ClickHouseType::Map(key_type, value_type) => {
-                // Validate key is String
                 let (key_arrow, _) = key_type.to_arrow()?;
+
                 if !matches!(key_arrow, DataType::Utf8) {
-                    return Err(
-                        "Map keys must be String type. Vector's ObjectMap only supports String keys."
-                            .to_string(),
-                    );
+                    return Err("Map keys must be String type.".to_string());
                 }
 
-                // Recursively convert value type
                 let (value_arrow, value_nullable) = value_type.to_arrow()?;
 
-                // Arrow Map is represented as Map<Struct<keys, values>>
-                let key_field = Field::new("keys", DataType::Utf8, false);
-                let value_field = Field::new("values", value_arrow, value_nullable);
-                let entries_struct = DataType::Struct(Fields::from(vec![key_field, value_field]));
-                let entries_field = Field::new("entries", entries_struct, false);
-                Ok((DataType::Map(Arc::new(entries_field), false), is_nullable))
+                let entries = DataType::Struct(Fields::from(vec![
+                    Field::new("keys", DataType::Utf8, false),
+                    Field::new("values", value_arrow, value_nullable),
+                ]));
+
+                DataType::Map(Field::new("entries", entries, false).into(), false)
             }
-            _ => Err("Unsupported ClickHouse type".to_string()),
-        }
+            _ => return Err("Unsupported ClickHouse type".to_string()),
+        };
+
+        Ok((data_type, is_nullable))
     }
 }
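One behavior from this last commit worth spelling out: unnamed tuple elements now receive positional `f{i}` names in a single step, replacing the earlier placeholder-then-rename dance with an empty string. A condensed sketch of just that naming rule; `tuple_field_name` is a hypothetical free function, where the patch inlines the same logic in the `Tuple` arm.

```rust
/// Field-name fallback as in the rewritten Tuple arm: a present,
/// non-empty parsed name wins; anything else becomes "f{i}".
fn tuple_field_name(parsed: Option<&str>, index: usize) -> String {
    parsed
        .filter(|s| !s.is_empty())
        .map(str::to_string)
        .unwrap_or_else(|| format!("f{index}"))
}

fn main() {
    assert_eq!(tuple_field_name(Some("category"), 0), "category");
    assert_eq!(tuple_field_name(None, 0), "f0"); // Tuple(Int32, Int64) -> f0, f1
    assert_eq!(tuple_field_name(Some(""), 1), "f1"); // empty names fall back too
}
```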