diff --git a/doc/_quarto.yml b/doc/_quarto.yml index a5801133..f5684647 100644 --- a/doc/_quarto.yml +++ b/doc/_quarto.yml @@ -39,6 +39,8 @@ website: href: syntax/clause/visualise.qmd - text: "`DRAW`" href: syntax/clause/draw.qmd + - text: "`PLACE`" + href: syntax/clause/place.qmd - text: "`SCALE`" href: syntax/clause/scale.qmd - text: "`FACET`" @@ -69,6 +71,8 @@ website: href: syntax/clause/visualise.qmd - text: "`DRAW`" href: syntax/clause/draw.qmd + - text: "`PLACE`" + href: syntax/clause/place.qmd - text: "`SCALE`" href: syntax/clause/scale.qmd - text: "`FACET`" diff --git a/doc/syntax/clause/place.qmd b/doc/syntax/clause/place.qmd new file mode 100644 index 00000000..6321bbed --- /dev/null +++ b/doc/syntax/clause/place.qmd @@ -0,0 +1,37 @@ +--- +title: "Create annotation layers with `PLACE`" +--- + +The `PLACE` clause is the little sibling of the [`DRAW` clause](../clause/draw.qmd) that also creates layers. +A layer created with `PLACE` has no mappings to data; all aesthetics are set using literal values instead. + +## Clause syntax +The `PLACE` clause takes just a type and a setting clause, both of which are required. + +```ggsql +PLACE + SETTING => , ... +``` + +Like `DRAW`, the layer type is required and specifies the type of layer to draw, like `point` or `text`. +It defines how the remaining settings are interpreted. +The [main syntax page](../index.qmd#layers) has a list of all available layer types. + +Unlike the `DRAW` clause, the `PLACE` clause does not support `FILTER`, `PARTITION BY`, and `ORDER BY` clauses since +everything is declared inline. + +### `SETTING` +```ggsql
SETTING => , ... +``` + +The `SETTING` clause can be used for two different things: + +* *Setting aesthetics*: All aesthetics in `PLACE` layers are specified using literal values, e.g. 'red' (as in the color red). +Aesthetics that are set will not go through a scale but will use the provided value as-is. +You cannot set an aesthetic to a column, only to literal values. 
+Contrary to `DRAW` layers, `PLACE` layers can take multiple literal values in an array. +* *Setting parameters*: Some layers take additional arguments that control how they behave. +Often, but not always, these modify the statistical transformation in some way. +An example would be the binwidth parameter in histogram which controls the width of each bin during histogram calculation. +This is not a statistical property since it is not related to each record, but to the calculation as a whole. diff --git a/doc/syntax/index.qmd b/doc/syntax/index.qmd index 1400280f..c58fa89e 100644 --- a/doc/syntax/index.qmd +++ b/doc/syntax/index.qmd @@ -7,6 +7,7 @@ ggsql augments the standard SQL syntax with a number of new clauses to describe - [`VISUALISE`](clause/visualise.qmd) initiates the visualisation part of the query - [`DRAW`](clause/draw.qmd) adds a new layer to the visualisation + - [`PLACE`](clause/place.qmd) adds an annotation layer - [`SCALE`](clause/scale.qmd) specify how an aesthetic should be scaled - [`FACET`](clause/facet.qmd) describes how data should be split into small multiples - [`PROJECT`](clause/project.qmd) is used for selecting the coordinate system to use diff --git a/src/execute/casting.rs b/src/execute/casting.rs index de370d86..403ff606 100644 --- a/src/execute/casting.rs +++ b/src/execute/casting.rs @@ -4,7 +4,7 @@ //! scale requirements and updating type info accordingly. 
use crate::plot::scale::coerce_dtypes; -use crate::plot::{CastTargetType, Layer, ParameterValue, Plot, SqlTypeNames}; +use crate::plot::{CastTargetType, Layer, Plot, SqlTypeNames}; use crate::{naming, DataSource}; use polars::prelude::{DataType, TimeUnit}; use std::collections::{HashMap, HashSet}; @@ -22,24 +22,6 @@ pub struct TypeRequirement { pub sql_type_name: String, } -/// Format a literal value as SQL -pub fn literal_to_sql(lit: &ParameterValue) -> String { - match lit { - ParameterValue::String(s) => format!("'{}'", s.replace('\'', "''")), - ParameterValue::Number(n) => n.to_string(), - ParameterValue::Boolean(b) => { - if *b { - "TRUE".to_string() - } else { - "FALSE".to_string() - } - } - ParameterValue::Array(_) | ParameterValue::Null => { - unreachable!("Grammar prevents arrays and null in literal aesthetic mappings") - } - } -} - /// Determine which columns need casting based on scale requirements. /// /// For each layer, collects columns that need casting to match the scale's @@ -223,6 +205,23 @@ pub fn determine_layer_source( Some(DataSource::FilePath(path)) => { format!("'{}'", path) } + Some(DataSource::Annotation(n)) => { + // Annotation layer - generate a dummy table with n rows. + // The execution pipeline expects all layers to have a DataFrame, even though + // SETTING literals eventually render as Vega-Lite datum values ({"value": ...}) + // that don't reference the data. The n-row dummy satisfies schema detection, + // type resolution, and other intermediate steps that expect data to exist. 
+ if *n == 1 { + "(SELECT 1 AS __ggsql_dummy__)".to_string() + } else { + // Generate VALUES clause with n rows: (VALUES (1), (2), ..., (n)) AS t(col) + let rows: Vec = (1..=*n).map(|i| format!("({})", i)).collect(); + format!( + "(SELECT * FROM (VALUES {}) AS t(__ggsql_dummy__))", + rows.join(", ") + ) + } + } None => { // Layer uses global data - caller must ensure has_global is true debug_assert!(has_global, "Layer has no source and no global data"); diff --git a/src/execute/layer.rs b/src/execute/layer.rs index d4d911fb..fcbe46c1 100644 --- a/src/execute/layer.rs +++ b/src/execute/layer.rs @@ -11,7 +11,7 @@ use crate::{naming, DataFrame, GgsqlError, Result}; use polars::prelude::DataType; use std::collections::{HashMap, HashSet}; -use super::casting::{literal_to_sql, TypeRequirement}; +use super::casting::TypeRequirement; use super::schema::build_aesthetic_schema; /// Build the source query for a layer. @@ -90,7 +90,7 @@ pub fn build_layer_select_list( } AestheticValue::Literal(lit) => { // Literals become columns with prefixed aesthetic name - format!("{} AS \"{}\"", literal_to_sql(lit), aes_col_name) + format!("{} AS \"{}\"", lit.to_sql(), aes_col_name) } }; @@ -498,6 +498,7 @@ where name: prefixed_name, original_name, is_dummy, + is_scaled: true, }; layer.mappings.insert(aesthetic.clone(), value); } diff --git a/src/execute/mod.rs b/src/execute/mod.rs index a8f17e07..6de7bda1 100644 --- a/src/execute/mod.rs +++ b/src/execute/mod.rs @@ -26,7 +26,7 @@ use crate::parser; use crate::plot::aesthetic::{is_positional_aesthetic, AestheticContext}; use crate::plot::facet::{resolve_properties as resolve_facet_properties, FacetDataContext}; use crate::plot::{AestheticValue, Layer, Scale, ScaleTypeKind, Schema}; -use crate::{DataFrame, GgsqlError, Plot, Result}; +use crate::{DataFrame, DataSource, GgsqlError, Plot, Result}; use std::collections::{HashMap, HashSet}; use crate::reader::Reader; @@ -153,6 +153,11 @@ fn validate(layers: &[Layer], layer_schemas: &[Schema]) -> 
Result<()> { fn merge_global_mappings_into_layers(specs: &mut [Plot], layer_schemas: &[Schema]) { for spec in specs { for (layer, schema) in spec.layers.iter_mut().zip(layer_schemas.iter()) { + // Skip annotation layers - they don't inherit global mappings + if matches!(layer.source, Some(DataSource::Annotation(_))) { + continue; + } + let supported = layer.geom.aesthetics().supported(); let schema_columns: HashSet<&str> = schema.iter().map(|c| c.name.as_str()).collect(); @@ -303,6 +308,7 @@ fn add_facet_mappings_to_layers( name: var.to_string(), original_name: Some(var.to_string()), is_dummy: false, + is_scaled: true, }, ); } @@ -1225,6 +1231,7 @@ pub fn prepare_data_with_reader(query: &str, reader: &R) -> Result 2, y => 25, label => 'Annotation', size => 14 + "#; + + let result = prepare_data_with_reader(query, &reader).unwrap(); + + assert_eq!(result.specs.len(), 1); + assert_eq!( + result.specs[0].layers.len(), + 2, + "Should have DRAW + PLACE layers" + ); + + // First layer: regular DRAW point + let point_layer = &result.specs[0].layers[0]; + assert_eq!(point_layer.geom, crate::Geom::point()); + assert!( + point_layer.source.is_none(), + "DRAW layer should have no explicit source" + ); + + // Second layer: PLACE text annotation + let annotation_layer = &result.specs[0].layers[1]; + assert_eq!(annotation_layer.geom, crate::Geom::text()); + assert!( + matches!(annotation_layer.source, Some(DataSource::Annotation(_))), + "PLACE layer should have Annotation source" + ); + + // Verify annotation layer has 1-row data + let annotation_key = annotation_layer.data_key.as_ref().unwrap(); + let annotation_df = result.data.get(annotation_key).unwrap(); + assert_eq!( + annotation_df.height(), + 1, + "Annotation layer should have exactly 1 row" + ); + + // Verify positional aesthetics are moved from SETTING to mappings with transformed names + // They become Column references (not Literals) so they can participate in scale training + assert!( + matches!( + 
annotation_layer.mappings.get("pos1"), + Some(AestheticValue::Column { name, .. }) if name == "__ggsql_aes_pos1__" + ), + "x should be transformed to pos1, moved to mappings, and materialized as column" + ); + assert!( + matches!( + annotation_layer.mappings.get("pos2"), + Some(AestheticValue::Column { name, .. }) if name == "__ggsql_aes_pos2__" + ), + "y should be transformed to pos2, moved to mappings, and materialized as column" + ); + + // Verify non-positional aesthetics are in mappings as Literals (via resolve_aesthetics) + assert!( + matches!( + annotation_layer.mappings.get("label"), + Some(AestheticValue::Literal(ParameterValue::String(s))) if s == "Annotation" + ), + "label should be in mappings as Literal" + ); + assert!( + matches!( + annotation_layer.mappings.get("size"), + Some(AestheticValue::Literal(ParameterValue::Number(n))) if *n == 14.0 + ), + "size should be in mappings as Literal" + ); + + // Parameters should be empty after resolve_aesthetics moves them to mappings + assert!( + annotation_layer.parameters.is_empty(), + "All SETTING parameters should be moved to mappings" + ); + } + + #[cfg(feature = "duckdb")] + #[test] + fn test_place_missing_required_aesthetic() { + let reader = DuckDBReader::from_connection_string("duckdb://memory").unwrap(); + + let query = r#" + SELECT 1 AS x, 2 AS y + VISUALISE x, y + DRAW point + PLACE text SETTING x => 5, label => 'Missing y!' 
+ "#; + + let result = prepare_data_with_reader(query, &reader); + assert!(result.is_err(), "Should fail validation"); + + match result { + Err(GgsqlError::ValidationError(msg)) => { + assert!( + msg.contains("pos2"), + "Error should mention missing pos2 aesthetic: {}", + msg + ); + } + Err(e) => panic!("Expected ValidationError, got: {}", e), + Ok(_) => panic!("Expected error, got success"), + } + } + + #[cfg(feature = "duckdb")] + #[test] + fn test_place_affects_scale_ranges() { + let reader = DuckDBReader::from_connection_string("duckdb://memory").unwrap(); + + // Data has x from 1-3, but PLACE annotation at x=10 should extend the range + let query = r#" + SELECT 1 AS x, 10 AS y UNION ALL + SELECT 2 AS x, 20 AS y UNION ALL + SELECT 3 AS x, 30 AS y + VISUALISE x, y + DRAW point + PLACE text SETTING x => 10, y => 50, label => 'Extended' + "#; + + let result = prepare_data_with_reader(query, &reader); + assert!(result.is_ok(), "Query should execute: {:?}", result.err()); + + let prep = result.unwrap(); + + // Check that x scale input_range includes both data range (1-3) and annotation point (10) + let x_scale = prep.specs[0].find_scale("pos1"); + assert!(x_scale.is_some(), "Should have x scale"); + + let scale = x_scale.unwrap(); + if let Some(input_range) = &scale.input_range { + // Input range should include the annotation x value (10) + // The scale input_range is resolved from the combined data + annotation DataFrames + assert!( + input_range.len() >= 2, + "Scale input_range should have min/max values" + ); + // Check that the range max is at least 10 (the annotation x value) + if let Some(max_val) = input_range.last() { + match max_val { + crate::plot::types::ArrayElement::Number(n) => { + assert!( + *n >= 10.0, + "Scale input_range max should include annotation point at x=10, got: {}", + n + ); + } + _ => panic!("Expected numeric input_range value"), + } + } + } else { + panic!("Scale should have an input_range"); + } + } + + #[cfg(feature = "duckdb")] + 
#[test] + fn test_place_array_recycling_scalar() { + let reader = DuckDBReader::from_connection_string("duckdb://memory").unwrap(); + + let query = r#" + SELECT 1 AS x, 10 AS y + VISUALISE x, y + DRAW point + PLACE text SETTING x => [1, 2, 3], y => 10, label => 'Same' + "#; + + let result = prepare_data_with_reader(query, &reader).unwrap(); + + let annotation_layer = &result.specs[0].layers[1]; + assert_eq!( + annotation_layer.source, + Some(DataSource::Annotation(3)), + "Should recycle scalar y to length 3" + ); + + let annotation_key = annotation_layer.data_key.as_ref().unwrap(); + let annotation_df = result.data.get(annotation_key).unwrap(); + assert_eq!(annotation_df.height(), 3, "Should have 3 rows"); + } + + #[cfg(feature = "duckdb")] + #[test] + fn test_place_array_mismatched_lengths_error() { + let reader = DuckDBReader::from_connection_string("duckdb://memory").unwrap(); + + let query = r#" + SELECT 1 AS x, 10 AS y + VISUALISE x, y + DRAW point + PLACE text SETTING x => [1, 2, 3], y => [10, 20], label => 'Test' + "#; + + let result = prepare_data_with_reader(query, &reader); + assert!(result.is_err(), "Should error on mismatched array lengths"); + + match result { + Err(GgsqlError::ValidationError(msg)) => { + assert!( + msg.contains("mismatched array lengths"), + "Error should mention mismatched lengths: {}", + msg + ); + } + Err(e) => panic!("Expected ValidationError, got: {}", e), + Ok(_) => panic!("Expected error, got success"), + } + } + + #[cfg(feature = "duckdb")] + #[test] + fn test_place_no_global_mapping_inheritance() { + let reader = DuckDBReader::from_connection_string("duckdb://memory").unwrap(); + + let query = r#" + SELECT 1 AS x, 2 AS y, 'red' AS color + VISUALISE x, y, color + DRAW point + PLACE text SETTING x => 5, y => 10, label => 'Test' + "#; + + let result = prepare_data_with_reader(query, &reader).unwrap(); + + // DRAW layer should have inherited global color mapping + let point_layer = &result.specs[0].layers[0]; + assert!( + 
point_layer.mappings.contains_key("color") + || point_layer.mappings.contains_key("fill") + || point_layer.mappings.contains_key("stroke"), + "DRAW layer should inherit color from global mappings" + ); + + // PLACE layer should NOT have inherited global mappings + let annotation_layer = &result.specs[0].layers[1]; + assert!( + !annotation_layer.mappings.contains_key("color"), + "PLACE layer should not inherit color from global mappings" + ); + assert!( + !annotation_layer.mappings.contains_key("fill"), + "PLACE layer should not inherit fill from global mappings" + ); + assert!( + !annotation_layer.mappings.contains_key("stroke"), + "PLACE layer should not inherit stroke from global mappings" + ); + } } diff --git a/src/execute/scale.rs b/src/execute/scale.rs index 3ad4cf2b..b8127e90 100644 --- a/src/execute/scale.rs +++ b/src/execute/scale.rs @@ -1005,9 +1005,16 @@ pub fn find_columns_for_aesthetic<'a>( for (i, layer) in layers.iter().enumerate() { if let Some(df) = data_map.get(&naming::layer_key(i)) { for aes_name in &aesthetics_to_check { - if let Some(AestheticValue::Column { name, .. }) = layer.mappings.get(aes_name) { - if let Ok(column) = df.column(name) { - column_refs.push(column); + if let Some(AestheticValue::Column { + name, is_scaled, .. + }) = layer.mappings.get(aes_name) + { + // Only train scales for columns that should be scaled + // Annotation layer literals have is_scaled: false + if *is_scaled { + if let Ok(column) = df.column(name) { + column_refs.push(column); + } } } } diff --git a/src/execute/schema.rs b/src/execute/schema.rs index b5b7cad8..c59b390a 100644 --- a/src/execute/schema.rs +++ b/src/execute/schema.rs @@ -6,9 +6,9 @@ //! 2. Apply casting to queries //! 3. 
complete_schema_ranges() - get min/max from cast queries -use crate::plot::{AestheticValue, ColumnInfo, Layer, ParameterValue, Schema}; +use crate::plot::{AestheticValue, ArrayElement, ColumnInfo, Layer, ParameterValue, Schema}; use crate::{naming, DataFrame, Result}; -use polars::prelude::DataType; +use polars::prelude::{DataType, TimeUnit}; /// Simple type info tuple: (name, dtype, is_discrete) pub type TypeInfo = (String, DataType, bool); @@ -246,16 +246,37 @@ pub fn add_literal_columns_to_type_info(layers: &[Layer], layer_type_info: &mut for (layer, type_info) in layers.iter().zip(layer_type_info.iter_mut()) { for (aesthetic, value) in &layer.mappings.aesthetics { if let AestheticValue::Literal(lit) = value { - let dtype = match lit { - ParameterValue::String(_) => DataType::String, - ParameterValue::Number(_) => DataType::Float64, - ParameterValue::Boolean(_) => DataType::Boolean, - ParameterValue::Array(_) | ParameterValue::Null => unreachable!( - "Grammar prevents arrays and null in literal aesthetic mappings" - ), + let (dtype, is_discrete) = match lit { + ParameterValue::String(_) => (DataType::String, true), + ParameterValue::Number(_) => (DataType::Float64, false), + ParameterValue::Boolean(_) => (DataType::Boolean, true), + ParameterValue::Array(arr) => { + // Infer dtype from first element (arrays are homogeneous) + if let Some(first) = arr.first() { + match first { + ArrayElement::String(_) => (DataType::String, true), + ArrayElement::Number(_) => (DataType::Float64, false), + ArrayElement::Boolean(_) => (DataType::Boolean, true), + ArrayElement::Date(_) => (DataType::Date, false), + ArrayElement::DateTime(_) => { + (DataType::Datetime(TimeUnit::Microseconds, None), false) + } + ArrayElement::Time(_) => (DataType::Time, false), + ArrayElement::Null => { + // Null element: default to Float64 + (DataType::Float64, false) + } + } + } else { + // Empty array: default to Float64 + (DataType::Float64, false) + } + } + ParameterValue::Null => { + // Skip 
null literals - they don't create columns + continue; + } }; - let is_discrete = - matches!(lit, ParameterValue::String(_) | ParameterValue::Boolean(_)); let col_name = naming::aesthetic_column(aesthetic); // Only add if not already present @@ -310,21 +331,41 @@ pub fn build_aesthetic_schema(layer: &Layer, schema: &Schema) -> Schema { } AestheticValue::Literal(lit) => { // Literals become columns with appropriate type - let dtype = match lit { - ParameterValue::String(_) => DataType::String, - ParameterValue::Number(_) => DataType::Float64, - ParameterValue::Boolean(_) => DataType::Boolean, - ParameterValue::Array(_) | ParameterValue::Null => unreachable!( - "Grammar prevents arrays and null in literal aesthetic mappings" - ), + let (dtype, is_discrete) = match lit { + ParameterValue::String(_) => (DataType::String, true), + ParameterValue::Number(_) => (DataType::Float64, false), + ParameterValue::Boolean(_) => (DataType::Boolean, true), + ParameterValue::Array(arr) => { + // Infer dtype from first element (arrays are homogeneous) + if let Some(first) = arr.first() { + match first { + ArrayElement::String(_) => (DataType::String, true), + ArrayElement::Number(_) => (DataType::Float64, false), + ArrayElement::Boolean(_) => (DataType::Boolean, true), + ArrayElement::Date(_) => (DataType::Date, false), + ArrayElement::DateTime(_) => { + (DataType::Datetime(TimeUnit::Microseconds, None), false) + } + ArrayElement::Time(_) => (DataType::Time, false), + ArrayElement::Null => { + // Null element: default to Float64 + (DataType::Float64, false) + } + } + } else { + // Empty array: default to Float64 + (DataType::Float64, false) + } + } + ParameterValue::Null => { + // Null: default to Float64 + (DataType::Float64, false) + } }; aesthetic_schema.push(ColumnInfo { name: aes_col_name, dtype, - is_discrete: matches!( - lit, - ParameterValue::String(_) | ParameterValue::Boolean(_) - ), + is_discrete, min: None, max: None, }); diff --git a/src/lib.rs b/src/lib.rs index 
6e3b8074..f968a673 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -758,6 +758,72 @@ mod integration_tests { ); } + #[test] + fn test_end_to_end_place_field_vs_value_encoding() { + // Test that PLACE annotation layers render correctly: + // - Positional aesthetics (x, y) as field encodings (reference columns) + // - Non-positional aesthetics (size, stroke) as value encodings (datum values) + + let reader = DuckDBReader::from_connection_string("duckdb://memory").unwrap(); + + let query = r#" + SELECT 1 AS x, 10 AS y UNION ALL SELECT 2 AS x, 20 AS y + VISUALISE x, y + DRAW point + PLACE text SETTING x => 5, y => 30, label => 'Annotation', size => 16, stroke => 'red' + "#; + + let prepared = execute::prepare_data_with_reader(query, &reader).unwrap(); + + // Render to Vega-Lite + let writer = VegaLiteWriter::new(); + let json_str = writer.write(&prepared.specs[0], &prepared.data).unwrap(); + let vl_spec: serde_json::Value = serde_json::from_str(&json_str).unwrap(); + + // Find the text layer (should be second layer) + let text_layer = &vl_spec["layer"][1]; + + // Mark can be either a string or an object with type + let mark_type = if text_layer["mark"].is_string() { + text_layer["mark"].as_str().unwrap() + } else { + text_layer["mark"]["type"].as_str().unwrap() + }; + assert_eq!(mark_type, "text", "Second layer should be text"); + + let encoding = &text_layer["encoding"]; + + // Positional aesthetics should be field encodings (have "field" key) + assert!( + encoding["x"]["field"].is_string(), + "x should be a field encoding: {:?}", + encoding["x"] + ); + assert!( + encoding["y"]["field"].is_string(), + "y should be a field encoding: {:?}", + encoding["y"] + ); + + // Non-positional aesthetics should be value encodings (have "value" key) + // Note: size may be scaled/transformed, so just check it's present as a value + assert!( + encoding["size"]["value"].is_number(), + "size should be a value encoding with numeric value: {:?}", + encoding["size"] + ); + + // Note: stroke 
color goes through resolve_aesthetics and may be in different location + // Just verify it's present somewhere as a literal + let text_mark = &text_layer["mark"]; + let has_stroke_value = encoding + .get("stroke") + .and_then(|s| s.get("value")) + .is_some() + || text_mark.get("stroke").is_some(); + assert!(has_stroke_value, "stroke should be present as a value"); + } + #[test] fn test_end_to_end_global_constant_in_visualise() { // Test that global constants in VISUALISE clause work correctly diff --git a/src/parser/builder.rs b/src/parser/builder.rs index f7bc9cb1..6f82f1c7 100644 --- a/src/parser/builder.rs +++ b/src/parser/builder.rs @@ -293,6 +293,11 @@ fn build_visualise_statement(node: &Node, source: &SourceTree) -> Result { // since geom definitions use internal names for their supported/required aesthetics spec.transform_aesthetics_to_internal(); + // Process annotation layers: move positional/required parameters to mappings + // This must happen AFTER transform_aesthetics_to_internal() so parameter keys are in internal space + // Returns error if arrays have mismatched lengths + spec.process_annotation_layers()?; + Ok(spec) } @@ -305,6 +310,10 @@ fn process_viz_clause(node: &Node, source: &SourceTree, spec: &mut Plot) -> Resu let layer = build_layer(&child, source)?; spec.layers.push(layer); } + "place_clause" => { + let layer = build_place_layer(&child, source)?; + spec.layers.push(layer); + } "scale_clause" => { let scale = build_scale(&child, source)?; spec.scales.push(scale); @@ -489,6 +498,22 @@ fn build_layer(node: &Node, source: &SourceTree) -> Result { Ok(layer) } +/// Build an annotation Layer from a place_clause node +/// This is similar to build_layer but marks it as an annotation layer. +/// The transformation of positional/required aesthetics from SETTING to mappings +/// happens later in Plot::transform_aesthetics_to_internal(). +/// Syntax: PLACE geom [MAPPING col AS x, ...] [SETTING param => val, ...] 
[FILTER condition] +fn build_place_layer(node: &Node, source: &SourceTree) -> Result { + // Build the layer using standard logic + let mut layer = build_layer(node, source)?; + + // Mark as annotation layer with initial length of 1 + // The length may be updated in process_annotation_layers() if arrays are present + layer.source = Some(DataSource::Annotation(1)); + + Ok(layer) +} + /// Parse a setting_clause: SETTING param => value, ... fn parse_setting_clause( node: &Node, diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 6c80d63a..2979a51e 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -413,4 +413,53 @@ mod tests { Some(DataSource::Identifier("cte".to_string())) ); } + + #[test] + fn test_place_clause() { + let query = r#" + SELECT x, y FROM data + VISUALISE x, y + DRAW point + PLACE text SETTING x => 5, y => 10, label => 'Hello' + "#; + + let result = parse_query(query); + assert!(result.is_ok(), "Failed to parse PLACE clause: {:?}", result); + + let specs = result.unwrap(); + assert_eq!(specs.len(), 1); + assert_eq!(specs[0].layers.len(), 2, "Expected 2 layers (DRAW + PLACE)"); + + // First layer: regular DRAW point + assert_eq!(specs[0].layers[0].geom, Geom::point()); + assert!( + specs[0].layers[0].source.is_none(), + "DRAW layer should have no explicit source" + ); + + // Second layer: PLACE text with annotation source + assert_eq!(specs[0].layers[1].geom, Geom::text()); + assert!( + matches!(specs[0].layers[1].source, Some(DataSource::Annotation(_))), + "PLACE layer should have Annotation source" + ); + + // Verify positional aesthetics moved to mappings with internal names + // (transform_aesthetics_to_internal runs as part of parse_query) + assert!( + specs[0].layers[1].mappings.contains_key("pos1"), + "x should be transformed to pos1 and moved to mappings" + ); + assert!( + specs[0].layers[1].mappings.contains_key("pos2"), + "y should be transformed to pos2 and moved to mappings" + ); + + // Verify non-positional parameter (label) stays 
in parameters (with internal name = same) + assert_eq!( + specs[0].layers[1].parameters.get("label"), + Some(&ParameterValue::String("Hello".to_string())), + "Non-positional parameters remain in parameters" + ); + } } diff --git a/src/plot/layer/mod.rs b/src/plot/layer/mod.rs index 84076071..5c7e80ab 100644 --- a/src/plot/layer/mod.rs +++ b/src/plot/layer/mod.rs @@ -239,6 +239,9 @@ impl Layer { /// happens in Polars after query execution, before the data goes to the writer. pub fn update_mappings_for_aesthetic_columns(&mut self) { use crate::naming; + use crate::plot::aesthetic::is_positional_aesthetic; + + let is_annotation = matches!(self.source, Some(crate::DataSource::Annotation(_))); for (aesthetic, value) in self.mappings.aesthetics.iter_mut() { let aes_col_name = naming::aesthetic_column(aesthetic); @@ -256,9 +259,14 @@ impl Layer { *name = aes_col_name; } AestheticValue::Literal(_) => { - // Literals are also columns with prefixed aesthetic name - // Note: literals don't have an original_name to preserve - *value = AestheticValue::standard_column(aes_col_name); + // Literals become columns with prefixed aesthetic name + // For annotation layers, non-positional aesthetics should use identity scales + let is_positional = is_positional_aesthetic(aesthetic); + *value = if is_annotation && !is_positional { + AestheticValue::identity_column(aes_col_name) + } else { + AestheticValue::standard_column(aes_col_name) + }; } } } @@ -285,6 +293,7 @@ impl Layer { AestheticValue::Column { original_name, is_dummy, + is_scaled, .. 
} => { // Use the stat name from remappings as the original_name for labels @@ -293,6 +302,7 @@ impl Layer { name: prefixed_name, original_name: original_name.clone(), is_dummy: *is_dummy, + is_scaled: *is_scaled, } } AestheticValue::Literal(_) => { @@ -302,6 +312,7 @@ impl Layer { name: prefixed_name, original_name: None, is_dummy: false, + is_scaled: true, } } }; diff --git a/src/plot/main.rs b/src/plot/main.rs index a211ac34..390b37e2 100644 --- a/src/plot/main.rs +++ b/src/plot/main.rs @@ -47,6 +47,10 @@ pub use super::projection::{Coord, Projection}; // Re-export Facet types from the facet module pub use super::facet::{Facet, FacetLayout}; +// ============================================================================= +// Plot Type +// ============================================================================= + /// Complete ggsql visualization specification #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Plot { @@ -168,6 +172,7 @@ impl Plot { /// - Global mappings /// - Layer aesthetics /// - Layer remappings + /// - Layer parameters (SETTING aesthetics) /// - Scale aesthetics pub fn transform_aesthetics_to_internal(&mut self) { let ctx = self.get_aesthetic_context(); @@ -175,10 +180,21 @@ impl Plot { // Transform global mappings self.global_mappings.transform_to_internal(&ctx); - // Transform layer aesthetics and remappings + // Transform layer aesthetics, remappings, and parameters for layer in &mut self.layers { layer.mappings.transform_to_internal(&ctx); layer.remappings.transform_to_internal(&ctx); + + // Transform parameter keys from user-facing to internal names + // This ensures position aesthetics in SETTING (e.g., x => 5) become internal (pos1 => 5) + let original_parameters = std::mem::take(&mut layer.parameters); + for (param_name, value) in original_parameters { + let internal_name = ctx + .map_user_to_internal(¶m_name) + .map(|s| s.to_string()) + .unwrap_or(param_name); + layer.parameters.insert(internal_name, value); + } } // 
Transform scale aesthetics
@@ -189,6 +205,92 @@ impl Plot {
         }
     }
 
+    /// Process annotation layers by moving positional and required aesthetics
+    /// from parameters to mappings, with array recycling support.
+    ///
+    /// This must be called AFTER transform_aesthetics_to_internal() so that
+    /// parameter keys are already in internal space (pos1, pos2, etc.).
+    ///
+    /// Array recycling rules:
+    /// - Scalars and length-1 arrays are recycled to match the max array length
+    /// - All non-1 arrays must have the same length (error on mismatch)
+    /// - Only required/positional aesthetics that are moved to mappings get recycled
+    ///
+    /// Positional aesthetics need to be in mappings so they go through the same
+    /// transformation pipeline (internal→user space) as data-mapped aesthetics,
+    /// enabling them to participate in scale training and coordinate transformations.
+    pub fn process_annotation_layers(&mut self) -> Result<(), crate::GgsqlError> {
+        for layer in &mut self.layers {
+            // Get initial length from DataSource
+            let mut max_length = match &layer.source {
+                Some(DataSource::Annotation(n)) => *n,
+                _ => continue,
+            };
+
+            // Step 1: Determine max array length from all parameters
+            let mut array_lengths: std::collections::HashMap<String, usize> =
+                std::collections::HashMap::new();
+
+            for (param_name, value) in &layer.parameters {
+                if let ParameterValue::Array(arr) = value {
+                    let len = arr.len();
+                    array_lengths.insert(param_name.clone(), len);
+                    if len > 1 && len != max_length {
+                        if max_length > 1 {
+                            // Multiple different non-1 lengths - error
+                            return Err(crate::GgsqlError::ValidationError(format!(
+                                "PLACE annotation layer has mismatched array lengths: '{}' has length {}, but another has length {}",
+                                param_name, len, max_length
+                            )));
+                        }
+                        max_length = len;
+                    }
+                }
+            }
+
+            // Step 2: Move required/positional/array aesthetics to mappings with recycling
+            let required_aesthetics = layer.geom.aesthetics().required();
+            let param_keys: Vec<String> =
+                layer.parameters.keys().cloned().collect();
+
+            for param_name in param_keys {
+                // Check if this is a positional aesthetic OR a required aesthetic OR an array
+                let is_positional = crate::plot::aesthetic::is_positional_aesthetic(&param_name);
+                let is_required = required_aesthetics.contains(&param_name.as_str());
+                let is_array = matches!(
+                    layer.parameters.get(&param_name),
+                    Some(ParameterValue::Array(_))
+                );
+
+                if is_positional || is_required || is_array {
+                    // Skip if already in mappings
+                    if layer.mappings.contains_key(&param_name) {
+                        continue;
+                    }
+
+                    // Move from parameters to mappings with recycling
+                    if let Some(value) = layer.parameters.remove(&param_name) {
+                        let recycled_value = if max_length > 1 {
+                            value.rep(max_length)?
+                        } else {
+                            value
+                        };
+
+                        layer
+                            .mappings
+                            .insert(&param_name, AestheticValue::Literal(recycled_value));
+                    }
+                }
+            }
+
+            // Step 3: Update DataSource with the correct length
+            if max_length > 1 {
+                layer.source = Some(DataSource::Annotation(max_length));
+            }
+        }
+
+        Ok(())
+    }
+
     /// Check if the spec has any layers
     pub fn has_layers(&self) -> bool {
         !self.layers.is_empty()
diff --git a/src/plot/types.rs b/src/plot/types.rs
index 95bfc2d6..d5955b68 100644
--- a/src/plot/types.rs
+++ b/src/plot/types.rs
@@ -140,6 +140,9 @@ pub enum DataSource {
     Identifier(String),
     /// File path (quoted string like 'data.csv')
     FilePath(String),
+    /// Annotation layer (PLACE clause) with number of annotation rows
+    /// The usize represents how many rows to generate (from array expansion)
+    Annotation(usize),
 }
 
 impl DataSource {
@@ -148,6 +151,7 @@ impl DataSource {
         match self {
            DataSource::Identifier(s) => s,
            DataSource::FilePath(s) => s,
+            DataSource::Annotation(_) => "__annotation__",
         }
     }
 
@@ -155,6 +159,11 @@ impl DataSource {
     pub fn is_file(&self) -> bool {
         matches!(self, DataSource::FilePath(_))
     }
+
+    /// Returns true if this is an annotation layer source
+    pub fn is_annotation(&self) -> bool {
+        matches!(self, DataSource::Annotation(_))
+    }
 }
 
 // =============================================================================
@@ -173,6 +182,10 @@ pub enum AestheticValue {
         original_name: Option<String>,
         /// Whether this is a dummy/placeholder column (e.g., for bar charts without x mapped)
         is_dummy: bool,
+        /// Whether scales should be applied to this column
+        /// Set to false for annotation layer literals (e.g., stroke => ['red', 'blue'])
+        /// that are converted to columns but should use identity scales
+        is_scaled: bool,
     },
     /// Literal value (quoted string, number, or boolean)
     Literal(ParameterValue),
@@ -185,6 +198,17 @@ impl AestheticValue {
             name: name.into(),
             original_name: None,
             is_dummy: false,
+            is_scaled: true,
+        }
+    }
+
+    /// Create a column mapping with identity scale (no scale transformation)
+    pub fn identity_column(name: impl Into<String>) -> Self {
+        Self::Column {
+            name: name.into(),
+            original_name: None,
+            is_dummy: false,
+            is_scaled: false,
+        }
+    }
 
@@ -194,6 +218,7 @@ impl AestheticValue {
             name: name.into(),
             original_name: None,
             is_dummy: true,
+            is_scaled: true,
         }
     }
 
@@ -206,6 +231,7 @@ impl AestheticValue {
             name: name.into(),
             original_name: Some(original_name.into()),
             is_dummy: false,
+            is_scaled: true,
         }
     }
 
@@ -609,6 +635,78 @@ impl ArrayElement {
         }
     }
 
+    /// Homogenize a slice of array elements to a common type.
+    ///
+    /// Infers the target type from all elements, then attempts to coerce all elements
+    /// to that type. If coercion fails (e.g., string + number), falls back to String type.
+    ///
+    /// Returns a new vector with homogenized elements.
+    pub fn homogenize(values: &[Self]) -> Vec<Self> {
+        // Infer target type from all elements
+        let Some(target_type) = Self::infer_type(values) else {
+            // All nulls or empty array - return cloned as-is
+            return values.to_vec();
+        };
+
+        // Try to coerce all elements to the inferred type
+        let coerced: Result<Vec<_>, _> = values
+            .iter()
+            .map(|elem| elem.coerce_to(target_type))
+            .collect();
+
+        match coerced {
+            Ok(coerced_arr) => coerced_arr,
+            Err(_) => {
+                // Coercion failed - fall back to String type
+                values
+                    .iter()
+                    .map(|elem| {
+                        elem.coerce_to(ArrayElementType::String)
+                            .unwrap_or(Self::Null)
+                    })
+                    .collect()
+            }
+        }
+    }
+
+    /// Convert this element to a SQL literal string.
+    ///
+    /// Used for generating SQL expressions from literal values.
+    pub fn to_sql(&self) -> String {
+        match self {
+            Self::String(s) => format!("'{}'", s.replace('\'', "''")),
+            Self::Number(n) => n.to_string(),
+            Self::Boolean(b) => {
+                if *b {
+                    "TRUE".to_string()
+                } else {
+                    "FALSE".to_string()
+                }
+            }
+            Self::Date(d) => {
+                // Convert days since epoch to DATE
+                format!("DATE '1970-01-01' + INTERVAL {} DAY", d)
+            }
+            Self::DateTime(dt) => {
+                // Convert microseconds since epoch to TIMESTAMP
+                format!(
+                    "TIMESTAMP '1970-01-01 00:00:00' + INTERVAL {} MICROSECOND",
+                    dt
+                )
+            }
+            Self::Time(t) => {
+                // Convert nanoseconds since midnight to TIME
+                let seconds = t / 1_000_000_000;
+                let nanos = t % 1_000_000_000;
+                format!(
+                    "TIME '00:00:00' + INTERVAL {} SECOND + INTERVAL {} NANOSECOND",
+                    seconds, nanos
+                )
+            }
+            Self::Null => "NULL".to_string(),
+        }
+    }
+
     /// Get the type name for error messages.
     fn type_name(&self) -> &'static str {
         match self {
@@ -788,6 +886,83 @@ impl ParameterValue {
             _ => None,
         }
     }
+
+    /// Convert this parameter value to a SQL literal string.
+    ///
+    /// For arrays, generates a CASE WHEN statement that selects array elements by row number
+    /// using the __ggsql_dummy__ column.
+ pub fn to_sql(&self) -> String { + match self { + ParameterValue::String(s) => format!("'{}'", s.replace('\'', "''")), + ParameterValue::Number(n) => n.to_string(), + ParameterValue::Boolean(b) => { + if *b { + "TRUE".to_string() + } else { + "FALSE".to_string() + } + } + ParameterValue::Array(arr) => { + let mut case_stmt = String::from("CASE __ggsql_dummy__"); + for (i, elem) in arr.iter().enumerate() { + let row_num = i + 1; + let value_sql = elem.to_sql(); + case_stmt.push_str(&format!(" WHEN {} THEN {}", row_num, value_sql)); + } + case_stmt.push_str(" END"); + case_stmt + } + ParameterValue::Null => "NULL".to_string(), + } + } + + /// Convert a scalar ParameterValue to an ArrayElement. + /// + /// Panics if called on an Array variant. + fn to_array_element(&self) -> ArrayElement { + match self { + ParameterValue::Number(num) => ArrayElement::Number(*num), + ParameterValue::String(s) => ArrayElement::String(s.clone()), + ParameterValue::Boolean(b) => ArrayElement::Boolean(*b), + ParameterValue::Null => ArrayElement::Null, + ParameterValue::Array(_) => panic!("Cannot convert Array to single ArrayElement"), + } + } + + /// Recycle this value to a target array length. 
+    ///
+    /// - Scalars (String, Number, Boolean, Null) are converted to arrays with n copies
+    /// - Arrays of length 1 are recycled to n copies of that element
+    /// - Arrays of target length are returned as-is (after homogenization)
+    /// - Arrays of other lengths produce an error
+    pub fn rep(self, n: usize) -> Result<Self, crate::GgsqlError> {
+        match self {
+            // Arrays: homogenize types if mixed, then recycle if needed
+            ParameterValue::Array(arr) => {
+                if arr.len() == 1 {
+                    // Recycle the single element
+                    let element = arr[0].clone();
+                    Ok(ParameterValue::Array(vec![element; n]))
+                } else if arr.len() == n {
+                    // Already correct length - homogenize for type consistency
+                    let arr = ArrayElement::homogenize(&arr);
+                    Ok(ParameterValue::Array(arr))
+                } else {
+                    // Mismatched length - shouldn't happen if validation passed
+                    Err(crate::GgsqlError::InternalError(format!(
+                        "Attempted to recycle array of length {} to length {} (should have been caught earlier)",
+                        arr.len(),
+                        n
+                    )))
+                }
+            }
+            // Scalars: convert to ArrayElement and replicate n times
+            scalar => {
+                let elem = scalar.to_array_element();
+                Ok(ParameterValue::Array(vec![elem; n]))
+            }
+        }
+    }
 }
 
 // =============================================================================
@@ -1366,4 +1541,26 @@ mod tests {
         let result = elem.coerce_to(ArrayElementType::Date);
         assert!(result.is_err());
     }
+
+    #[test]
+    fn test_homogenize_mixed_number_string() {
+        let arr = vec![
+            ArrayElement::Number(1.0),
+            ArrayElement::String("foo".to_string()),
+        ];
+
+        let homogenized = ArrayElement::homogenize(&arr);
+
+        // Should fall back to String type since "foo" can't be coerced to Number
+        assert_eq!(homogenized.len(), 2);
+        assert!(matches!(homogenized[0], ArrayElement::String(_)));
+        assert!(matches!(homogenized[1], ArrayElement::String(_)));
+
+        if let ArrayElement::String(s) = &homogenized[0] {
+            assert_eq!(s, "1");
+        }
+        if let ArrayElement::String(s) = &homogenized[1] {
+            assert_eq!(s, "foo");
+        }
+    }
 }
diff --git a/src/writer/vegalite/data.rs b/src/writer/vegalite/data.rs
index 4b1c89e3..a19b7e36 100644
--- a/src/writer/vegalite/data.rs
+++ b/src/writer/vegalite/data.rs
@@ -97,16 +97,8 @@ pub(super) fn series_value_at(series: &Series, idx: usize) -> Result<Value> {
             let ca = series
                 .str()
                 .map_err(|e| GgsqlError::WriterError(format!("Failed to cast to string: {}", e)))?;
-            // Try to parse as number if it looks numeric
-            if let Some(val) = ca.get(idx) {
-                if let Ok(num) = val.parse::<f64>() {
-                    Ok(json!(num))
-                } else {
-                    Ok(json!(val))
-                }
-            } else {
-                Ok(Value::Null)
-            }
+            // Keep strings as strings (don't parse to numbers)
+            Ok(ca.get(idx).map(|v| json!(v)).unwrap_or(Value::Null))
         }
         Date => {
             // Convert days since epoch to ISO date string: "YYYY-MM-DD"
diff --git a/src/writer/vegalite/encoding.rs b/src/writer/vegalite/encoding.rs
index f91ace3a..d6f53d2f 100644
--- a/src/writer/vegalite/encoding.rs
+++ b/src/writer/vegalite/encoding.rs
@@ -802,7 +803,8 @@ pub(super) fn build_encoding_channel(
             name: col,
             original_name,
             is_dummy,
-        } => build_column_encoding(aesthetic, col, original_name, *is_dummy, ctx),
+            is_scaled,
+        } => build_column_encoding(aesthetic, col, original_name, *is_dummy, *is_scaled, ctx),
         AestheticValue::Literal(lit) => build_literal_encoding(aesthetic, lit),
     }
 }
@@ -813,13 +814,14 @@ fn build_column_encoding(
     aesthetic: &str,
     col: &str,
     original_name: &Option<String>,
     is_dummy: bool,
+    is_scaled: bool,
     ctx: &mut EncodingContext,
 ) -> Result<Value> {
     let aesthetic_ctx = ctx.spec.get_aesthetic_context();
     let primary = aesthetic_ctx
         .primary_internal_positional(aesthetic)
         .unwrap_or(aesthetic);
-    let mut identity_scale = false;
+    let mut identity_scale = !is_scaled;
 
     // Determine field type from scale or infer from data
     let field_type = determine_field_type_for_aesthetic(
@@ -937,6 +939,12 @@ fn build_literal_encoding(aesthetic: &str, lit: &ParameterValue) -> Result<Value> {
                 json!(n),
             }
         }
+        ParameterValue::Array(_) => {
+            return Err(crate::GgsqlError::WriterError(format!(
+                "The `{aes}` SETTING must be scalar, 
not an array.", + aes = aesthetic + ))) + } _ => lit.to_json(), }; Ok(json!({"value": val})) diff --git a/tree-sitter-ggsql/grammar.js b/tree-sitter-ggsql/grammar.js index 9302f195..dc1a8487 100644 --- a/tree-sitter-ggsql/grammar.js +++ b/tree-sitter-ggsql/grammar.js @@ -442,6 +442,7 @@ module.exports = grammar({ // All the visualization clauses (same as current grammar) viz_clause: $ => choice( $.draw_clause, + $.place_clause, $.scale_clause, $.facet_clause, $.project_clause, @@ -461,6 +462,14 @@ module.exports = grammar({ optional($.order_clause) ), + // PLACE clause - syntax: PLACE geom [SETTING ...] + // For annotation layers with literal values only (no data mappings) + place_clause: $ => seq( + caseInsensitive('PLACE'), + $.geom_type, + optional($.setting_clause) + ), + // REMAPPING clause: maps stat-computed columns to aesthetics // Syntax: REMAPPING count AS y, sum AS size // Reuses mapping_list for parsing - stat names are treated as column references