diff --git a/docs/query-data/udf/java-user-defined-function.md b/docs/query-data/udf/java-user-defined-function.md
index 6c18a25c9e3b5..30ff90aedee33 100644
--- a/docs/query-data/udf/java-user-defined-function.md
+++ b/docs/query-data/udf/java-user-defined-function.md
@@ -137,7 +137,8 @@ When writing a UDF in Java, the main entry point must be the `evaluate` function
"file"="file:///path/to/java-udf-demo-jar-with-dependencies.jar",
"symbol"="org.apache.doris.udf.AddOne",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -344,7 +345,8 @@ public void destroy(State state) {
"file"="file:///pathTo/java-udaf.jar",
"symbol"="org.apache.doris.udf.SimpleDemo",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -386,7 +388,8 @@ void reset(State state)
"file"="file:///pathTo/java-udaf.jar",
"symbol"="org.apache.doris.udf.SimpleDemo",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -431,7 +434,8 @@ Like UDF, UDTF requires you to implement the `evaluate` method, but the return v
"file"="file:///pathTo/java-udtf.jar",
"symbol"="org.apache.doris.udf.demo.UDTFStringTest",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -527,7 +531,8 @@ The steps are as follows:
"file"="file:///pathTo/FunctionUdf.jar",
"symbol"="org.apache.doris.udf.FunctionUdf",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
diff --git a/docs/query-data/udf/python-user-defined-function.md b/docs/query-data/udf/python-user-defined-function.md
index 463dd549d0f88..228f61a7a8975 100644
--- a/docs/query-data/udf/python-user-defined-function.md
+++ b/docs/query-data/udf/python-user-defined-function.md
@@ -81,7 +81,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "entry_function_name",
"runtime_version" = "python_version",
- "always_nullable" = "true|false"
+ "always_nullable" = "true|false",
+ "volatility" = "immutable|stable|volatile"
)
AS $$
def entry_function_name(param1, param2, ...):
@@ -100,7 +101,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(a, b):
@@ -120,7 +122,8 @@ RETURNS STRING
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s1, s2):
@@ -293,7 +296,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.add_three_numbers",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_reverse(STRING)
@@ -303,7 +307,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.reverse_string",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_is_prime(INT)
@@ -313,7 +318,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.is_prime",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -331,7 +337,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.add_three_numbers",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_reverse(STRING)
@@ -341,7 +348,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.reverse_string",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_is_prime(INT)
@@ -351,7 +359,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.is_prime",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -368,7 +377,8 @@ PROPERTIES (
"file" = "file:///path/to/my_udf.zip",
"symbol" = "my_udf.math_ops.multiply_by_two",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -411,6 +421,7 @@ DROP FUNCTION IF EXISTS py_is_prime(INT);
| `file` | No | - | Path to the Python `.zip` package. Required only in module mode. Supports three protocols:
• `file://`: local file system path
• `http://`: HTTP remote download
• `https://`: HTTPS remote download |
| `runtime_version` | Yes | - | Python runtime version, such as `"3.10.12"`. The complete version number is required |
| `always_nullable` | No | `true` | Whether the function always returns a nullable result |
+| `volatility` | No | `volatile` | Function volatility. Valid values are `immutable`, `stable`, and `volatile`.
`immutable`: the same input always produces the same output. Most deterministic UDFs should use this value for better plan optimization.
`stable`: the result is stable within one SQL statement but may change between statements, similar to `now()`. SQL cache and materialized view rewrite are disabled.
`volatile`: the result may change on each call, similar to `random()`. SQL cache, materialized view rewrite, and many optimizer rewrite rules are disabled. |
#### Runtime Version Notes
@@ -478,7 +489,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(a, b):
@@ -529,7 +541,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "add",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -552,7 +565,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "to_upper",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -575,7 +589,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "sqrt",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -615,7 +630,8 @@ PROPERTIES (
"type"="PYTHON_UDF",
"symbol"="py_mixed_vector_add_impl",
"always_nullable"="true",
- "runtime_version"="3.12.11"
+ "runtime_version"="3.12.11",
+ "volatility"="immutable"
)
AS $$
import pandas as pd
@@ -701,7 +717,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(arr):
@@ -725,7 +742,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(arr):
@@ -750,7 +768,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(m):
@@ -774,7 +793,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(m, key):
@@ -799,7 +819,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s):
@@ -826,7 +847,8 @@ RETURNS STRING
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(email):
@@ -852,7 +874,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s1, s2):
@@ -889,7 +912,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
from datetime import datetime
@@ -918,7 +942,8 @@ RETURNS BOOLEAN
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(id_card):
@@ -1155,7 +1180,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "SumUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class SumUDAF:
@@ -1222,7 +1248,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "AvgUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class AvgUDAF:
@@ -1428,7 +1455,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.VarianceUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE AGGREGATE FUNCTION py_stddev(DOUBLE)
@@ -1438,7 +1466,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.StdDevUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE AGGREGATE FUNCTION py_median(DOUBLE)
@@ -1448,7 +1477,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.MedianUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -1530,6 +1560,7 @@ DROP FUNCTION IF EXISTS py_variance(DOUBLE);
| `file` | No | - | Path to the Python `.zip` package. Required only in module mode. Supports three protocols:
• `file://`: local file system path
• `http://`: HTTP remote download
• `https://`: HTTPS remote download |
| `runtime_version` | Yes | - | Python runtime version, such as `"3.10.12"` |
| `always_nullable` | No | `true` | Whether the function always returns a nullable result |
+| `volatility` | No | `immutable` | Function volatility. Valid values are `immutable`, `stable`, and `volatile`.
`immutable`: the same input always produces the same output. Most deterministic UDFs should use this value for better plan optimization.
`stable`: the result is stable within one SQL statement but may change between statements, similar to `now()`. SQL cache and materialized view rewrite are disabled.
`volatile`: the result may change on each call, similar to `random()`. SQL cache, materialized view rewrite, and many optimizer rewrite rules are disabled. |
#### runtime_version Notes
@@ -1572,7 +1603,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "RunningSumUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class RunningSumUDAF:
@@ -1648,7 +1680,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "PercentileUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class PercentileUDAF:
@@ -1743,7 +1776,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "CollectSetUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class CollectSetUDAF:
@@ -2058,7 +2092,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "split_string_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def split_string_udtf(text, delimiter):
@@ -2094,7 +2129,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "generate_series_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def generate_series_udtf(start, end):
@@ -2146,7 +2182,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "duplicate_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def duplicate_udtf(text, n):
@@ -2180,7 +2217,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "cartesian_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def cartesian_udtf(list1, list2):
@@ -2220,7 +2258,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "explode_json_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import json
@@ -2382,7 +2421,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.split_lines_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_extract_emails(STRING)
@@ -2392,7 +2432,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.extract_emails_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_parse_json(STRING)
@@ -2402,7 +2443,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.parse_json_object_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_expand_json(STRING)
@@ -2412,7 +2454,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.expand_json_array_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_ngram(STRING, INT)
@@ -2422,7 +2465,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.ngram_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -2531,6 +2575,7 @@ CREATE TABLES FUNCTION py_split(STRING, STRING) ...;
| `file` | No | - | Path to the Python `.zip` package. Required only in module mode. Supports three protocols:
• `file://`: local file system path
• `http://`: HTTP remote download
• `https://`: HTTPS remote download |
| `runtime_version` | Yes | - | Python runtime version, such as `"3.10.12"` |
| `always_nullable` | No | `true` | Whether the function always returns a nullable result |
+| `volatility` | No | `immutable` | Function volatility. Valid values are `immutable`, `stable`, and `volatile`.
`immutable`: the same input always produces the same output. Most deterministic UDFs should use this value for better plan optimization.
`stable`: the result is stable within one SQL statement but may change between statements, similar to `now()`. SQL cache and materialized view rewrite are disabled.
`volatile`: the result may change on each call, similar to `random()`. SQL cache, materialized view rewrite, and many optimizer rewrite rules are disabled. |
#### runtime_version Notes
@@ -2562,7 +2607,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "parse_csv_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def parse_csv_udtf(csv_data):
@@ -2605,7 +2651,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "date_range_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
from datetime import datetime, timedelta
@@ -2653,7 +2700,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "tokenize_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import re
@@ -2696,7 +2744,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "parse_url_params_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
from urllib.parse import urlparse, parse_qs
@@ -2742,7 +2791,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "expand_ip_range_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def expand_ip_range_udtf(start_ip, end_ip):
@@ -3099,7 +3149,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.12.11", -- Must specify the complete version number to match Python 3.12.11
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(x):
@@ -3201,7 +3252,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.9.18", -- Must specify the complete version number to match Python 3.9.18
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(text):
@@ -3215,7 +3267,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.12.11", -- Must specify the complete version number to match Python 3.12.11
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import numpy as np
diff --git a/docs/sql-manual/sql-statements/function/CREATE-FUNCTION.md b/docs/sql-manual/sql-statements/function/CREATE-FUNCTION.md
index 1de8ba51cb1c6..095fbf350992d 100644
--- a/docs/sql-manual/sql-statements/function/CREATE-FUNCTION.md
+++ b/docs/sql-manual/sql-statements/function/CREATE-FUNCTION.md
@@ -75,6 +75,10 @@ CREATE [ GLOBAL ]
> - `symbol`: Indicates the class name containing the UDF class. This parameter is mandatory.
> - `type`: Indicates the UDF call type. The default is Native. Use JAVA_UDF when using a Java UDF.
> - `always_nullable`: Indicates whether the UDF result may contain NULL values. This is an optional parameter with a default value of true.
+> - `volatility`: Indicates how stable the function result is. This is an optional parameter. The default value is `volatile` for scalar UDFs, and `immutable` for UDAF/UDTF. Valid values are:
+> - `immutable`: The same input always produces the same output. Most deterministic UDFs should be marked as `immutable` so that the optimizer can apply more plan optimizations.
+> - `stable`: The result is stable within a single SQL statement but may change between statements, similar to `now()`. SQL cache and materialized view rewrite are disabled for this type of function.
+> - `volatile`: The result may change on each call, similar to `random()`. SQL cache, materialized view rewrite, and many optimizer rewrite rules are disabled for this type of function.
## Access Control Requirements
@@ -91,7 +95,8 @@ To execute this command, the user must have `ADMIN_PRIV` privileges.
"file"="file:///path/to/java-udf-demo-jar-with-dependencies.jar",
"symbol"="org.apache.doris.udf.AddOne",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -135,4 +140,92 @@ To execute this command, the user must have `ADMIN_PRIV` privileges.
```sql
CREATE GLOBAL ALIAS FUNCTION id_masking(INT) WITH PARAMETER(id) AS CONCAT(LEFT(id, 3), '****', RIGHT(id, 4));
- ```
\ No newline at end of file
+ ```
+
+6. Create a volatile Python UDF. Functions such as `uuid.uuid4()` that depend on randomness should keep the default `volatility = volatile` and must not be incorrectly marked as `immutable`.
+
+ ```sql
+ CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1 PROPERTIES ("replication_num" = "1");
+ INSERT INTO cte_uuid_seed VALUES (1),(2),(3);
+
+ DROP FUNCTION IF EXISTS py_uuid_token(INT);
+ CREATE FUNCTION py_uuid_token(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_uuid_token_impl",
+ "always_nullable" = "false",
+ "runtime_version" = "3.12.11",
+ "volatility" = "volatile"
+ )
+ AS $$
+import uuid
+def py_uuid_token_impl(x):
+ return f"{x}-{uuid.uuid4()}"
+$$;
+
+ SET enable_cte_materialize = true;
+ SET inline_cte_referenced_threshold = 10;
+
+ WITH cte AS (SELECT id, py_uuid_token(id) AS token FROM cte_uuid_seed)
+ SELECT id, COUNT(DISTINCT token) AS distinct_tokens
+ FROM (SELECT id, token FROM cte UNION ALL SELECT id, token FROM cte) u
+ GROUP BY id ORDER BY id;
+ ```
+
+ Correct result:
+
+ ```text
+ +------+-----------------+
+ | id | distinct_tokens |
+ +------+-----------------+
+ | 1 | 1 |
+ | 2 | 1 |
+ | 3 | 1 |
+ +------+-----------------+
+ ```
+
+ For this function, the following definition is incorrect:
+
+ ```sql
+ DROP FUNCTION IF EXISTS py_uuid_token(INT);
+ CREATE FUNCTION py_uuid_token(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_uuid_token_impl",
+ "always_nullable" = "false",
+ "runtime_version" = "3.12.11",
+ "volatility" = "immutable"
+ )
+ AS $$
+import uuid
+def py_uuid_token_impl(x):
+ return f"{x}-{uuid.uuid4()}"
+$$;
+ ```
+
+ Run the same query again:
+
+ ```sql
+ WITH cte AS (SELECT id, py_uuid_token(id) AS token FROM cte_uuid_seed)
+ SELECT id, COUNT(DISTINCT token) AS distinct_tokens
+ FROM (SELECT id, token FROM cte UNION ALL SELECT id, token FROM cte) u
+ GROUP BY id ORDER BY id;
+ ```
+
+ Incorrect result:
+
+ ```text
+ +------+-----------------+
+ | id | distinct_tokens |
+ +------+-----------------+
+ | 1 | 2 |
+ | 2 | 2 |
+ | 3 | 2 |
+ +------+-----------------+
+ ```
+
+ Why this is wrong:
+ Because `py_uuid_token` is volatile, each call to `uuid.uuid4()` generates a new value. If the function is incorrectly marked as `volatility = immutable`, the optimizer may treat repeated references as safe to rewrite and may choose a plan that evaluates the UDF separately on both sides of `UNION ALL`. As a result, the same `id` can produce two different `token` values, and `COUNT(DISTINCT token)` changes from `1` to `2`.
\ No newline at end of file
diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-data/udf/java-user-defined-function.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-data/udf/java-user-defined-function.md
index 93bc319885ba1..838d1de464894 100644
--- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-data/udf/java-user-defined-function.md
+++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-data/udf/java-user-defined-function.md
@@ -137,7 +137,8 @@ insert into test_table values (6, 666.66, "d,e");
"file"="file:///path/to/java-udf-demo-jar-with-dependencies.jar",
"symbol"="org.apache.doris.udf.AddOne",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -344,7 +345,8 @@ public void destroy(State state) {
"file"="file:///pathTo/java-udaf.jar",
"symbol"="org.apache.doris.udf.SimpleDemo",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -386,7 +388,8 @@ void reset(State state)
"file"="file:///pathTo/java-udaf.jar",
"symbol"="org.apache.doris.udf.SimpleDemo",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -431,7 +434,8 @@ UDTF 与 UDF 一样需要用户实现 `evaluate` 方法,但 UDTF 的返回值
"file"="file:///pathTo/java-udtf.jar",
"symbol"="org.apache.doris.udf.demo.UDTFStringTest",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -527,7 +531,8 @@ public class FunctionUdf {
"file"="file:///pathTo/FunctionUdf.jar",
"symbol"="org.apache.doris.udf.FunctionUdf",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-data/udf/python-user-defined-function.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-data/udf/python-user-defined-function.md
index 9f4026404516c..d0b2c2f37513c 100644
--- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-data/udf/python-user-defined-function.md
+++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-data/udf/python-user-defined-function.md
@@ -81,7 +81,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "entry_function_name",
"runtime_version" = "python_version",
- "always_nullable" = "true|false"
+ "always_nullable" = "true|false",
+ "volatility" = "immutable|stable|volatile"
)
AS $$
def entry_function_name(param1, param2, ...):
@@ -100,7 +101,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(a, b):
@@ -120,7 +122,8 @@ RETURNS STRING
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s1, s2):
@@ -293,7 +296,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.add_three_numbers",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_reverse(STRING)
@@ -303,7 +307,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.reverse_string",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_is_prime(INT)
@@ -313,7 +318,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.is_prime",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -331,7 +337,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.add_three_numbers",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_reverse(STRING)
@@ -341,7 +348,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.reverse_string",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_is_prime(INT)
@@ -351,7 +359,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.is_prime",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -368,7 +377,8 @@ PROPERTIES (
"file" = "file:///path/to/my_udf.zip",
"symbol" = "my_udf.math_ops.multiply_by_two",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -411,6 +421,7 @@ DROP FUNCTION IF EXISTS py_is_prime(INT);
| `file` | 否 | - | Python `.zip` 包路径,仅模块模式需要。支持三种协议:
• `file://` —— 本地文件系统路径
• `http://` —— HTTP 远程下载
• `https://` —— HTTPS 远程下载 |
| `runtime_version` | 是 | - | Python 运行时版本,如 `"3.10.12"`,需填写完整的版本号 |
| `always_nullable` | 否 | `true` | 是否总是返回可空结果 |
+| `volatility` | 否 | `volatile` | 函数易变性。可选值为 `immutable`、`stable`、`volatile`。
`immutable`:相同输入始终产生相同输出。大部分确定性 UDF 建议设置为 `immutable`,这样计划优化效果更好。
`stable`:在同一条 SQL 语句内结果稳定,但不同语句之间可能变化,类似 `now()`。这类函数会禁用 SQL Cache 和物化视图改写。
`volatile`:每次调用结果都可能变化,类似 `random()`。这类函数会禁用 SQL Cache、物化视图改写以及很多优化器改写规则。 |
#### 运行时版本说明
@@ -478,7 +489,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(a, b):
@@ -529,7 +541,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "add",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -552,7 +565,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "to_upper",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -575,7 +589,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "sqrt",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -615,7 +630,8 @@ PROPERTIES (
"type"="PYTHON_UDF",
"symbol"="py_mixed_vector_add_impl",
"always_nullable"="true",
- "runtime_version"="3.12.11"
+ "runtime_version"="3.12.11",
+ "volatility"="immutable"
)
AS $$
import pandas as pd
@@ -701,7 +717,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(arr):
@@ -725,7 +742,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(arr):
@@ -750,7 +768,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(m):
@@ -774,7 +793,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(m, key):
@@ -799,7 +819,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s):
@@ -826,7 +847,8 @@ RETURNS STRING
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(email):
@@ -852,7 +874,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s1, s2):
@@ -889,7 +912,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
from datetime import datetime
@@ -918,7 +942,8 @@ RETURNS BOOLEAN
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(id_card):
@@ -1155,7 +1180,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "SumUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class SumUDAF:
@@ -1222,7 +1248,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "AvgUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class AvgUDAF:
@@ -1428,7 +1455,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.VarianceUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE AGGREGATE FUNCTION py_stddev(DOUBLE)
@@ -1438,7 +1466,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.StdDevUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE AGGREGATE FUNCTION py_median(DOUBLE)
@@ -1448,7 +1477,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.MedianUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -1530,6 +1560,7 @@ DROP FUNCTION IF EXISTS py_variance(DOUBLE);
| `file` | 否 | - | Python `.zip` 包路径,仅模块模式需要。支持三种协议:
• `file://` —— 本地文件系统路径
• `http://` —— HTTP 远程下载
• `https://` —— HTTPS 远程下载 |
| `runtime_version` | 是 | - | Python 运行时版本,如 `"3.10.12"` |
| `always_nullable` | 否 | `true` | 是否总是返回可空结果 |
+| `volatility` | 否 | `immutable` | 函数易变性。可选值为 `immutable`、`stable`、`volatile`。
`immutable`:相同输入始终产生相同输出。大部分确定性 UDF 建议设置为 `immutable`,这样计划优化效果更好。
`stable`:在同一条 SQL 语句内结果稳定,但不同语句之间可能变化,类似 `now()`。这类函数会禁用 SQL Cache 和物化视图改写。
`volatile`:每次调用结果都可能变化,类似 `random()`。这类函数会禁用 SQL Cache、物化视图改写以及很多优化器改写规则。 |
#### runtime_version 说明
@@ -1572,7 +1603,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "RunningSumUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class RunningSumUDAF:
@@ -1648,7 +1680,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "PercentileUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class PercentileUDAF:
@@ -1743,7 +1776,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "CollectSetUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class CollectSetUDAF:
@@ -2058,7 +2092,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "split_string_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def split_string_udtf(text, delimiter):
@@ -2094,7 +2129,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "generate_series_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def generate_series_udtf(start, end):
@@ -2146,7 +2182,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "duplicate_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def duplicate_udtf(text, n):
@@ -2180,7 +2217,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "cartesian_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def cartesian_udtf(list1, list2):
@@ -2220,7 +2258,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "explode_json_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import json
@@ -2382,7 +2421,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.split_lines_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_extract_emails(STRING)
@@ -2392,7 +2432,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.extract_emails_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_parse_json(STRING)
@@ -2402,7 +2443,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.parse_json_object_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_expand_json(STRING)
@@ -2412,7 +2454,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.expand_json_array_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_ngram(STRING, INT)
@@ -2422,7 +2465,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.ngram_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -2531,6 +2575,7 @@ CREATE TABLES FUNCTION py_split(STRING, STRING) ...;
| `file` | 否 | - | Python `.zip` 包路径,仅模块模式需要。支持三种协议:
• `file://` —— 本地文件系统路径
• `http://` —— HTTP 远程下载
• `https://` —— HTTPS 远程下载 |
| `runtime_version` | 是 | - | Python 运行时版本,如 `"3.10.12"` |
| `always_nullable` | 否 | `true` | 是否总是返回可空结果 |
+| `volatility` | 否 | `immutable` | 函数易变性。可选值为 `immutable`、`stable`、`volatile`。
`immutable`:相同输入始终产生相同输出。大部分确定性 UDF 建议设置为 `immutable`,这样计划优化效果更好。
`stable`:在同一条 SQL 语句内结果稳定,但不同语句之间可能变化,类似 `now()`。这类函数会禁用 SQL Cache 和物化视图改写。
`volatile`:每次调用结果都可能变化,类似 `random()`。这类函数会禁用 SQL Cache、物化视图改写以及很多优化器改写规则。 |
#### runtime_version 说明
@@ -2562,7 +2607,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "parse_csv_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def parse_csv_udtf(csv_data):
@@ -2605,7 +2651,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "date_range_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
from datetime import datetime, timedelta
@@ -2653,7 +2700,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "tokenize_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import re
@@ -2696,7 +2744,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "parse_url_params_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
from urllib.parse import urlparse, parse_qs
@@ -2742,7 +2791,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "expand_ip_range_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def expand_ip_range_udtf(start_ip, end_ip):
@@ -3099,7 +3149,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.12.11", -- 必须指定完整版本号,匹配 Python 3.12.11
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(x):
@@ -3201,7 +3252,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.9.18", -- 必须指定完整版本号,匹配 Python 3.9.18
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(text):
@@ -3215,7 +3267,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.12.11", -- 必须指定完整版本号,匹配 Python 3.12.11
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import numpy as np
diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/function/CREATE-FUNCTION.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/function/CREATE-FUNCTION.md
index f398af2dce173..3e222218bd75d 100644
--- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/function/CREATE-FUNCTION.md
+++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/function/CREATE-FUNCTION.md
@@ -74,6 +74,10 @@ CREATE [ GLOBAL ]
> - `symbol`: 表示的是包含 UDF 类的类名。这个参数是必须设定的
> - `type`: 表示的 UDF 调用类型,默认为 Native,使用 Java UDF 时传 JAVA_UDF。
> - `always_nullable`:表示的 UDF 返回结果中是否有可能出现 NULL 值,是可选参数,默认值为 true。
+> - `volatility`:表示函数结果的稳定性,是可选参数。标量 UDF 的默认值为 `volatile`,UDAF/UDTF 的默认值为 `immutable`。可选值为:
+> - `immutable`:相同输入始终产生相同输出。大部分确定性 UDF 建议设置为 `immutable`,这样优化器可以应用更多计划优化。
+> - `stable`:在同一条 SQL 语句内结果稳定,但不同语句之间可能变化,类似 `now()`。这类函数会禁用 SQL Cache 和物化视图改写。
+> - `volatile`:每次调用结果都可能变化,类似 `random()`。这类函数会禁用 SQL Cache、物化视图改写以及很多优化器改写规则。
## 权限控制
@@ -88,7 +92,8 @@ CREATE [ GLOBAL ]
"file"="file:///path/to/java-udf-demo-jar-with-dependencies.jar",
"symbol"="org.apache.doris.udf.AddOne",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -124,4 +129,92 @@ CREATE [ GLOBAL ]
```sql
CREATE GLOBAL ALIAS FUNCTION id_masking(INT) WITH PARAMETER(id) AS CONCAT(LEFT(id, 3), '****', RIGHT(id, 4));
- ```
\ No newline at end of file
+ ```
+
+6. 创建一个 volatile Python UDF。像 `uuid.uuid4()` 这类依赖随机数的函数,应保持 `volatility` 的默认值 `volatile`,不要错误标记为 `immutable`。
+
+ ```sql
+ CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1 PROPERTIES ("replication_num" = "1");
+ INSERT INTO cte_uuid_seed VALUES (1),(2),(3);
+
+ DROP FUNCTION IF EXISTS py_uuid_token(INT);
+ CREATE FUNCTION py_uuid_token(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_uuid_token_impl",
+ "always_nullable" = "false",
+ "runtime_version" = "3.12.11",
+ "volatility" = "volatile"
+ )
+ AS $$
+import uuid
+def py_uuid_token_impl(x):
+ return f"{x}-{uuid.uuid4()}"
+$$;
+
+ SET enable_cte_materialize = true;
+ SET inline_cte_referenced_threshold = 10;
+
+ WITH cte AS (SELECT id, py_uuid_token(id) AS token FROM cte_uuid_seed)
+ SELECT id, COUNT(DISTINCT token) AS distinct_tokens
+ FROM (SELECT id, token FROM cte UNION ALL SELECT id, token FROM cte) u
+ GROUP BY id ORDER BY id;
+ ```
+
+ 正确结果:
+
+ ```text
+ +------+-----------------+
+ | id | distinct_tokens |
+ +------+-----------------+
+ | 1 | 1 |
+ | 2 | 1 |
+ | 3 | 1 |
+ +------+-----------------+
+ ```
+
+ 对于上述函数,不应写成下面这样:
+
+ ```sql
+ DROP FUNCTION IF EXISTS py_uuid_token(INT);
+ CREATE FUNCTION py_uuid_token(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_uuid_token_impl",
+ "always_nullable" = "false",
+ "runtime_version" = "3.12.11",
+ "volatility" = "immutable"
+ )
+ AS $$
+import uuid
+def py_uuid_token_impl(x):
+ return f"{x}-{uuid.uuid4()}"
+$$;
+ ```
+
+ 重新执行同一条查询:
+
+ ```sql
+ WITH cte AS (SELECT id, py_uuid_token(id) AS token FROM cte_uuid_seed)
+ SELECT id, COUNT(DISTINCT token) AS distinct_tokens
+ FROM (SELECT id, token FROM cte UNION ALL SELECT id, token FROM cte) u
+ GROUP BY id ORDER BY id;
+ ```
+
+ 错误结果:
+
+ ```text
+ +------+-----------------+
+ | id | distinct_tokens |
+ +------+-----------------+
+ | 1 | 2 |
+ | 2 | 2 |
+ | 3 | 2 |
+ +------+-----------------+
+ ```
+
+ 错误原因:
+ `py_uuid_token` 是 volatile 函数,`uuid.uuid4()` 每次调用都会生成新值。如果错误地将它标记为 `volatility = immutable`,优化器可能会把重复引用视为可安全改写,并选择在 `UNION ALL` 两侧分别执行 UDF 的计划。这样同一个 `id` 会生成两个不同的 `token`,`COUNT(DISTINCT token)` 就会从 `1` 变成 `2`。
\ No newline at end of file
diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/query-data/udf/java-user-defined-function.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/query-data/udf/java-user-defined-function.md
index 93bc319885ba1..838d1de464894 100644
--- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/query-data/udf/java-user-defined-function.md
+++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/query-data/udf/java-user-defined-function.md
@@ -137,7 +137,8 @@ insert into test_table values (6, 666.66, "d,e");
"file"="file:///path/to/java-udf-demo-jar-with-dependencies.jar",
"symbol"="org.apache.doris.udf.AddOne",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -344,7 +345,8 @@ public void destroy(State state) {
"file"="file:///pathTo/java-udaf.jar",
"symbol"="org.apache.doris.udf.SimpleDemo",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -386,7 +388,8 @@ void reset(State state)
"file"="file:///pathTo/java-udaf.jar",
"symbol"="org.apache.doris.udf.SimpleDemo",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -431,7 +434,8 @@ UDTF 与 UDF 一样需要用户实现 `evaluate` 方法,但 UDTF 的返回值
"file"="file:///pathTo/java-udtf.jar",
"symbol"="org.apache.doris.udf.demo.UDTFStringTest",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -527,7 +531,8 @@ public class FunctionUdf {
"file"="file:///pathTo/FunctionUdf.jar",
"symbol"="org.apache.doris.udf.FunctionUdf",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/query-data/udf/python-user-defined-function.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/query-data/udf/python-user-defined-function.md
index 9f4026404516c..b6a8842ba5b0d 100644
--- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/query-data/udf/python-user-defined-function.md
+++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/query-data/udf/python-user-defined-function.md
@@ -81,7 +81,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "entry_function_name",
"runtime_version" = "python_version",
- "always_nullable" = "true|false"
+ "always_nullable" = "true|false",
+ "volatility" = "immutable|stable|volatile"
)
AS $$
def entry_function_name(param1, param2, ...):
@@ -100,7 +101,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(a, b):
@@ -120,7 +122,8 @@ RETURNS STRING
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s1, s2):
@@ -293,7 +296,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.add_three_numbers",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_reverse(STRING)
@@ -303,7 +307,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.reverse_string",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_is_prime(INT)
@@ -313,7 +318,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.is_prime",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -331,7 +337,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.add_three_numbers",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_reverse(STRING)
@@ -341,7 +348,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.reverse_string",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_is_prime(INT)
@@ -351,7 +359,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.is_prime",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -368,7 +377,8 @@ PROPERTIES (
"file" = "file:///path/to/my_udf.zip",
"symbol" = "my_udf.math_ops.multiply_by_two",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -411,6 +421,7 @@ DROP FUNCTION IF EXISTS py_is_prime(INT);
| `file` | 否 | - | Python `.zip` 包路径,仅模块模式需要。支持三种协议:
• `file://` —— 本地文件系统路径
• `http://` —— HTTP 远程下载
• `https://` —— HTTPS 远程下载 |
| `runtime_version` | 是 | - | Python 运行时版本,如 `"3.10.12"`,需填写完整的版本号 |
| `always_nullable` | 否 | `true` | 是否总是返回可空结果 |
+| `volatility` | 否 | `volatile` | 函数易变性,自 4.1.2 起支持。可选值为 `immutable`、`stable`、`volatile`。
`immutable`:相同输入始终产生相同输出。大部分确定性 UDF 建议设置为 `immutable`,这样计划优化效果更好。
`stable`:在同一条 SQL 语句内结果稳定,但不同语句之间可能变化,类似 `now()`。这类函数会禁用 SQL Cache 和物化视图改写。
`volatile`:每次调用结果都可能变化,类似 `random()`。这类函数会禁用 SQL Cache、物化视图改写以及很多优化器改写规则。 |
#### 运行时版本说明
@@ -478,7 +489,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(a, b):
@@ -529,7 +541,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "add",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -552,7 +565,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "to_upper",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -575,7 +589,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "sqrt",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -615,7 +630,8 @@ PROPERTIES (
"type"="PYTHON_UDF",
"symbol"="py_mixed_vector_add_impl",
"always_nullable"="true",
- "runtime_version"="3.12.11"
+ "runtime_version"="3.12.11",
+ "volatility"="immutable"
)
AS $$
import pandas as pd
@@ -701,7 +717,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(arr):
@@ -725,7 +742,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(arr):
@@ -750,7 +768,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(m):
@@ -774,7 +793,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(m, key):
@@ -799,7 +819,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s):
@@ -826,7 +847,8 @@ RETURNS STRING
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(email):
@@ -852,7 +874,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s1, s2):
@@ -889,7 +912,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
from datetime import datetime
@@ -918,7 +942,8 @@ RETURNS BOOLEAN
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(id_card):
@@ -1155,7 +1180,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "SumUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class SumUDAF:
@@ -1222,7 +1248,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "AvgUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class AvgUDAF:
@@ -1428,7 +1455,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.VarianceUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE AGGREGATE FUNCTION py_stddev(DOUBLE)
@@ -1438,7 +1466,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.StdDevUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE AGGREGATE FUNCTION py_median(DOUBLE)
@@ -1448,7 +1477,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.MedianUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -1530,6 +1560,7 @@ DROP FUNCTION IF EXISTS py_variance(DOUBLE);
| `file` | 否 | - | Python `.zip` 包路径,仅模块模式需要。支持三种协议:
• `file://` —— 本地文件系统路径
• `http://` —— HTTP 远程下载
• `https://` —— HTTPS 远程下载 |
| `runtime_version` | 是 | - | Python 运行时版本,如 `"3.10.12"` |
| `always_nullable` | 否 | `true` | 是否总是返回可空结果 |
+| `volatility` | 否 | `immutable` | 函数易变性,自 4.1.2 起支持。可选值为 `immutable`、`stable`、`volatile`。
`immutable`:相同输入始终产生相同输出。大部分确定性 UDF 建议设置为 `immutable`,这样计划优化效果更好。
`stable`:在同一条 SQL 语句内结果稳定,但不同语句之间可能变化,类似 `now()`。这类函数会禁用 SQL Cache 和物化视图改写。
`volatile`:每次调用结果都可能变化,类似 `random()`。这类函数会禁用 SQL Cache、物化视图改写以及很多优化器改写规则。 |
#### runtime_version 说明
@@ -1572,7 +1603,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "RunningSumUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class RunningSumUDAF:
@@ -1648,7 +1680,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "PercentileUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class PercentileUDAF:
@@ -1743,7 +1776,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "CollectSetUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class CollectSetUDAF:
@@ -2058,7 +2092,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "split_string_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def split_string_udtf(text, delimiter):
@@ -2094,7 +2129,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "generate_series_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def generate_series_udtf(start, end):
@@ -2146,7 +2182,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "duplicate_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def duplicate_udtf(text, n):
@@ -2180,7 +2217,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "cartesian_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def cartesian_udtf(list1, list2):
@@ -2220,7 +2258,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "explode_json_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import json
@@ -2382,7 +2421,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.split_lines_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_extract_emails(STRING)
@@ -2392,7 +2432,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.extract_emails_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_parse_json(STRING)
@@ -2402,7 +2443,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.parse_json_object_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_expand_json(STRING)
@@ -2412,7 +2454,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.expand_json_array_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_ngram(STRING, INT)
@@ -2422,7 +2465,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.ngram_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -2531,6 +2575,7 @@ CREATE TABLES FUNCTION py_split(STRING, STRING) ...;
| `file` | 否 | - | Python `.zip` 包路径,仅模块模式需要。支持三种协议:
• `file://` —— 本地文件系统路径
• `http://` —— HTTP 远程下载
• `https://` —— HTTPS 远程下载 |
| `runtime_version` | 是 | - | Python 运行时版本,如 `"3.10.12"` |
| `always_nullable` | 否 | `true` | 是否总是返回可空结果 |
+| `volatility` | 否 | `immutable` | 函数易变性,自 4.1.2 起支持。可选值为 `immutable`、`stable`、`volatile`。
`immutable`:相同输入始终产生相同输出。大部分确定性 UDF 建议设置为 `immutable`,这样计划优化效果更好。
`stable`:在同一条 SQL 语句内结果稳定,但不同语句之间可能变化,类似 `now()`。这类函数会禁用 SQL Cache 和物化视图改写。
`volatile`:每次调用结果都可能变化,类似 `random()`。这类函数会禁用 SQL Cache、物化视图改写以及很多优化器改写规则。 |
#### runtime_version 说明
@@ -2562,7 +2607,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "parse_csv_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def parse_csv_udtf(csv_data):
@@ -2605,7 +2651,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "date_range_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
from datetime import datetime, timedelta
@@ -2653,7 +2700,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "tokenize_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import re
@@ -2696,7 +2744,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "parse_url_params_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
from urllib.parse import urlparse, parse_qs
@@ -2742,7 +2791,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "expand_ip_range_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def expand_ip_range_udtf(start_ip, end_ip):
@@ -3099,7 +3149,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.12.11", -- 必须指定完整版本号,匹配 Python 3.12.11
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(x):
@@ -3201,7 +3252,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.9.18", -- 必须指定完整版本号,匹配 Python 3.9.18
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(text):
@@ -3215,7 +3267,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.12.11", -- 必须指定完整版本号,匹配 Python 3.12.11
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import numpy as np
diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-statements/function/CREATE-FUNCTION.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-statements/function/CREATE-FUNCTION.md
index f398af2dce173..c7f95c64bdea3 100644
--- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-statements/function/CREATE-FUNCTION.md
+++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-statements/function/CREATE-FUNCTION.md
@@ -74,6 +74,10 @@ CREATE [ GLOBAL ]
> - `symbol`: 表示的是包含 UDF 类的类名。这个参数是必须设定的
> - `type`: 表示的 UDF 调用类型,默认为 Native,使用 Java UDF 时传 JAVA_UDF。
> - `always_nullable`:表示的 UDF 返回结果中是否有可能出现 NULL 值,是可选参数,默认值为 true。
+> - `volatility`:表示函数结果的稳定性,自 4.1.2 起支持。该参数是可选参数。标量 UDF 的默认值为 `volatile`,UDAF/UDTF 的默认值为 `immutable`。可选值为:
+> - `immutable`:相同输入始终产生相同输出。大部分确定性 UDF 建议设置为 `immutable`,这样优化器可以应用更多计划优化。
+> - `stable`:在同一条 SQL 语句内结果稳定,但不同语句之间可能变化,类似 `now()`。这类函数会禁用 SQL Cache 和物化视图改写。
+> - `volatile`:每次调用结果都可能变化,类似 `random()`。这类函数会禁用 SQL Cache、物化视图改写以及很多优化器改写规则。
## 权限控制
@@ -88,7 +92,8 @@ CREATE [ GLOBAL ]
"file"="file:///path/to/java-udf-demo-jar-with-dependencies.jar",
"symbol"="org.apache.doris.udf.AddOne",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -124,4 +129,92 @@ CREATE [ GLOBAL ]
```sql
CREATE GLOBAL ALIAS FUNCTION id_masking(INT) WITH PARAMETER(id) AS CONCAT(LEFT(id, 3), '****', RIGHT(id, 4));
- ```
\ No newline at end of file
+ ```
+
+6. 创建一个 volatile Python UDF。像 `uuid.uuid4()` 这类依赖随机数的函数,应保持 `volatility` 的默认值 `volatile`,不要错误标记为 `immutable`。
+
+ ```sql
+ CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1 PROPERTIES ("replication_num" = "1");
+ INSERT INTO cte_uuid_seed VALUES (1),(2),(3);
+
+ DROP FUNCTION IF EXISTS py_uuid_token(INT);
+ CREATE FUNCTION py_uuid_token(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_uuid_token_impl",
+ "always_nullable" = "false",
+ "runtime_version" = "3.12.11",
+ "volatility" = "volatile"
+ )
+ AS $$
+import uuid
+def py_uuid_token_impl(x):
+ return f"{x}-{uuid.uuid4()}"
+$$;
+
+ SET enable_cte_materialize = true;
+ SET inline_cte_referenced_threshold = 10;
+
+ WITH cte AS (SELECT id, py_uuid_token(id) AS token FROM cte_uuid_seed)
+ SELECT id, COUNT(DISTINCT token) AS distinct_tokens
+ FROM (SELECT id, token FROM cte UNION ALL SELECT id, token FROM cte) u
+ GROUP BY id ORDER BY id;
+ ```
+
+ 正确结果:
+
+ ```text
+ +------+-----------------+
+ | id | distinct_tokens |
+ +------+-----------------+
+ | 1 | 1 |
+ | 2 | 1 |
+ | 3 | 1 |
+ +------+-----------------+
+ ```
+
+ 对于上述函数,不应写成下面这样:
+
+ ```sql
+ DROP FUNCTION IF EXISTS py_uuid_token(INT);
+ CREATE FUNCTION py_uuid_token(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_uuid_token_impl",
+ "always_nullable" = "false",
+ "runtime_version" = "3.12.11",
+ "volatility" = "immutable"
+ )
+ AS $$
+import uuid
+def py_uuid_token_impl(x):
+ return f"{x}-{uuid.uuid4()}"
+$$;
+ ```
+
+ 重新执行同一条查询:
+
+ ```sql
+ WITH cte AS (SELECT id, py_uuid_token(id) AS token FROM cte_uuid_seed)
+ SELECT id, COUNT(DISTINCT token) AS distinct_tokens
+ FROM (SELECT id, token FROM cte UNION ALL SELECT id, token FROM cte) u
+ GROUP BY id ORDER BY id;
+ ```
+
+ 错误结果:
+
+ ```text
+ +------+-----------------+
+ | id | distinct_tokens |
+ +------+-----------------+
+ | 1 | 2 |
+ | 2 | 2 |
+ | 3 | 2 |
+ +------+-----------------+
+ ```
+
+ 错误原因:
+ `py_uuid_token` 是 volatile 函数,`uuid.uuid4()` 每次调用都会生成新值。如果错误地将它标记为 `volatility = immutable`,优化器可能会把重复引用视为可安全改写,并选择在 `UNION ALL` 两侧分别执行 UDF 的计划。这样同一个 `id` 会生成两个不同的 `token`,`COUNT(DISTINCT token)` 就会从 `1` 变成 `2`。
\ No newline at end of file
diff --git a/versioned_docs/version-4.x/query-data/udf/java-user-defined-function.md b/versioned_docs/version-4.x/query-data/udf/java-user-defined-function.md
index 6c18a25c9e3b5..30ff90aedee33 100644
--- a/versioned_docs/version-4.x/query-data/udf/java-user-defined-function.md
+++ b/versioned_docs/version-4.x/query-data/udf/java-user-defined-function.md
@@ -137,7 +137,8 @@ When writing a UDF in Java, the main entry point must be the `evaluate` function
"file"="file:///path/to/java-udf-demo-jar-with-dependencies.jar",
"symbol"="org.apache.doris.udf.AddOne",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -344,7 +345,8 @@ public void destroy(State state) {
"file"="file:///pathTo/java-udaf.jar",
"symbol"="org.apache.doris.udf.SimpleDemo",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -386,7 +388,8 @@ void reset(State state)
"file"="file:///pathTo/java-udaf.jar",
"symbol"="org.apache.doris.udf.SimpleDemo",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -431,7 +434,8 @@ Like UDF, UDTF requires you to implement the `evaluate` method, but the return v
"file"="file:///pathTo/java-udtf.jar",
"symbol"="org.apache.doris.udf.demo.UDTFStringTest",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -527,7 +531,8 @@ The steps are as follows:
"file"="file:///pathTo/FunctionUdf.jar",
"symbol"="org.apache.doris.udf.FunctionUdf",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
diff --git a/versioned_docs/version-4.x/query-data/udf/python-user-defined-function.md b/versioned_docs/version-4.x/query-data/udf/python-user-defined-function.md
index 463dd549d0f88..de2303edd9d30 100644
--- a/versioned_docs/version-4.x/query-data/udf/python-user-defined-function.md
+++ b/versioned_docs/version-4.x/query-data/udf/python-user-defined-function.md
@@ -81,7 +81,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "entry_function_name",
"runtime_version" = "python_version",
- "always_nullable" = "true|false"
+ "always_nullable" = "true|false",
+ "volatility" = "immutable|stable|volatile"
)
AS $$
def entry_function_name(param1, param2, ...):
@@ -100,7 +101,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(a, b):
@@ -120,7 +122,8 @@ RETURNS STRING
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s1, s2):
@@ -293,7 +296,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.add_three_numbers",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_reverse(STRING)
@@ -303,7 +307,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.reverse_string",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_is_prime(INT)
@@ -313,7 +318,8 @@ PROPERTIES (
"file" = "file:///path/to/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.is_prime",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -331,7 +337,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.add_three_numbers",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_reverse(STRING)
@@ -341,7 +348,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.reverse_string",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE FUNCTION py_is_prime(INT)
@@ -351,7 +359,8 @@ PROPERTIES (
"file" = "https://your-storage.com/udf/python_udf_scalar_ops.zip",
"symbol" = "python_udf_scalar_ops.is_prime",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -368,7 +377,8 @@ PROPERTIES (
"file" = "file:///path/to/my_udf.zip",
"symbol" = "my_udf.math_ops.multiply_by_two",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -411,6 +421,7 @@ DROP FUNCTION IF EXISTS py_is_prime(INT);
| `file` | No | - | Path to the Python `.zip` package. Required only in module mode. Supports three protocols:
• `file://`: local file system path
• `http://`: HTTP remote download
• `https://`: HTTPS remote download |
| `runtime_version` | Yes | - | Python runtime version, such as `"3.10.12"`. The complete version number is required |
| `always_nullable` | No | `true` | Whether the function always returns a nullable result |
+| `volatility` | No | `volatile` | Function volatility, supported since 4.1.2. Valid values are `immutable`, `stable`, and `volatile`.
`immutable`: the same input always produces the same output. Most deterministic UDFs should use this value for better plan optimization.
`stable`: the result is stable within one SQL statement but may change between statements, similar to `now()`. SQL cache and materialized view rewrite are disabled.
`volatile`: the result may change on each call, similar to `random()`. SQL cache, materialized view rewrite, and many optimizer rewrite rules are disabled. |
#### Runtime Version Notes
@@ -478,7 +489,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(a, b):
@@ -529,7 +541,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "add",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -552,7 +565,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "to_upper",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -575,7 +589,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "sqrt",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import pandas as pd
@@ -615,7 +630,8 @@ PROPERTIES (
"type"="PYTHON_UDF",
"symbol"="py_mixed_vector_add_impl",
"always_nullable"="true",
- "runtime_version"="3.12.11"
+ "runtime_version"="3.12.11",
+ "volatility"="immutable"
)
AS $$
import pandas as pd
@@ -701,7 +717,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(arr):
@@ -725,7 +742,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(arr):
@@ -750,7 +768,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(m):
@@ -774,7 +793,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(m, key):
@@ -799,7 +819,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s):
@@ -826,7 +847,8 @@ RETURNS STRING
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(email):
@@ -852,7 +874,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(s1, s2):
@@ -889,7 +912,8 @@ RETURNS INT
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
from datetime import datetime
@@ -918,7 +942,8 @@ RETURNS BOOLEAN
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
- "runtime_version" = "3.10.12"
+ "runtime_version" = "3.10.12",
+ "volatility" = "immutable"
)
AS $$
def evaluate(id_card):
@@ -1155,7 +1180,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "SumUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class SumUDAF:
@@ -1222,7 +1248,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "AvgUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class AvgUDAF:
@@ -1428,7 +1455,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.VarianceUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE AGGREGATE FUNCTION py_stddev(DOUBLE)
@@ -1438,7 +1466,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.StdDevUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE AGGREGATE FUNCTION py_median(DOUBLE)
@@ -1448,7 +1477,8 @@ PROPERTIES (
"file" = "file:///path/to/stats_udaf.zip",
"symbol" = "stats_udaf.MedianUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -1530,6 +1560,7 @@ DROP FUNCTION IF EXISTS py_variance(DOUBLE);
| `file` | No | - | Path to the Python `.zip` package. Required only in module mode. Supports three protocols:
• `file://`: local file system path
• `http://`: HTTP remote download
• `https://`: HTTPS remote download |
| `runtime_version` | Yes | - | Python runtime version, such as `"3.10.12"` |
| `always_nullable` | No | `true` | Whether the function always returns a nullable result |
+| `volatility` | No | `immutable` | Function volatility, supported since 4.1.2. Valid values are `immutable`, `stable`, and `volatile`.
`immutable`: the same input always produces the same output. Most deterministic UDFs should use this value for better plan optimization.
`stable`: the result is stable within one SQL statement but may change between statements, similar to `now()`. SQL cache and materialized view rewrite are disabled.
`volatile`: the result may change on each call, similar to `random()`. SQL cache, materialized view rewrite, and many optimizer rewrite rules are disabled. |
#### runtime_version Notes
@@ -1572,7 +1603,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "RunningSumUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class RunningSumUDAF:
@@ -1648,7 +1680,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "PercentileUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class PercentileUDAF:
@@ -1743,7 +1776,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "CollectSetUDAF",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
class CollectSetUDAF:
@@ -2058,7 +2092,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "split_string_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def split_string_udtf(text, delimiter):
@@ -2094,7 +2129,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "generate_series_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def generate_series_udtf(start, end):
@@ -2146,7 +2182,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "duplicate_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def duplicate_udtf(text, n):
@@ -2180,7 +2217,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "cartesian_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def cartesian_udtf(list1, list2):
@@ -2220,7 +2258,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "explode_json_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import json
@@ -2382,7 +2421,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.split_lines_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_extract_emails(STRING)
@@ -2392,7 +2432,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.extract_emails_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_parse_json(STRING)
@@ -2402,7 +2443,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.parse_json_object_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_expand_json(STRING)
@@ -2412,7 +2454,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.expand_json_array_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
CREATE TABLES FUNCTION py_ngram(STRING, INT)
@@ -2422,7 +2465,8 @@ PROPERTIES (
"file" = "file:///path/to/text_udtf.zip",
"symbol" = "text_udtf.ngram_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
);
```
@@ -2531,6 +2575,7 @@ CREATE TABLES FUNCTION py_split(STRING, STRING) ...;
| `file` | No | - | Path to the Python `.zip` package. Required only in module mode. Supports three protocols:
• `file://`: local file system path
• `http://`: HTTP remote download
• `https://`: HTTPS remote download |
| `runtime_version` | Yes | - | Python runtime version, such as `"3.10.12"` |
| `always_nullable` | No | `true` | Whether the function always returns a nullable result |
+| `volatility` | No | `immutable` | Function volatility, supported since 4.1.2. Valid values are `immutable`, `stable`, and `volatile`.
`immutable`: the same input always produces the same output. Most deterministic UDFs should use this value for better plan optimization.
`stable`: the result is stable within one SQL statement but may change between statements, similar to `now()`. SQL cache and materialized view rewrite are disabled.
`volatile`: the result may change on each call, similar to `random()`. SQL cache, materialized view rewrite, and many optimizer rewrite rules are disabled. |
#### runtime_version Notes
@@ -2562,7 +2607,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "parse_csv_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def parse_csv_udtf(csv_data):
@@ -2605,7 +2651,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "date_range_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
from datetime import datetime, timedelta
@@ -2653,7 +2700,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "tokenize_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import re
@@ -2696,7 +2744,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "parse_url_params_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
from urllib.parse import urlparse, parse_qs
@@ -2742,7 +2791,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "expand_ip_range_udtf",
"runtime_version" = "3.10.12",
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def expand_ip_range_udtf(start_ip, end_ip):
@@ -3099,7 +3149,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.12.11", -- Must specify the complete version number to match Python 3.12.11
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(x):
@@ -3201,7 +3252,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.9.18", -- Must specify the complete version number to match Python 3.9.18
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
def evaluate(text):
@@ -3215,7 +3267,8 @@ PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.12.11", -- Must specify the complete version number to match Python 3.12.11
- "always_nullable" = "true"
+ "always_nullable" = "true",
+ "volatility" = "immutable"
)
AS $$
import numpy as np
diff --git a/versioned_docs/version-4.x/sql-manual/sql-statements/function/CREATE-FUNCTION.md b/versioned_docs/version-4.x/sql-manual/sql-statements/function/CREATE-FUNCTION.md
index 1de8ba51cb1c6..68e9e5f77b176 100644
--- a/versioned_docs/version-4.x/sql-manual/sql-statements/function/CREATE-FUNCTION.md
+++ b/versioned_docs/version-4.x/sql-manual/sql-statements/function/CREATE-FUNCTION.md
@@ -75,6 +75,10 @@ CREATE [ GLOBAL ]
> - `symbol`: Indicates the class name containing the UDF class. This parameter is mandatory.
> - `type`: Indicates the UDF call type. The default is Native. Use JAVA_UDF when using a Java UDF.
> - `always_nullable`: Indicates whether the UDF result may contain NULL values. This is an optional parameter with a default value of true.
+> - `volatility`: Indicates how stable the function result is. This parameter is supported since 4.1.2. It is optional. The default value is `volatile` for scalar UDFs, and `immutable` for UDAF/UDTF. Valid values are:
+> - `immutable`: The same input always produces the same output. Most deterministic UDFs should be marked as `immutable` so that the optimizer can apply more plan optimizations.
+> - `stable`: The result is stable within a single SQL statement but may change between statements, similar to `now()`. SQL cache and materialized view rewrite are disabled for this type of function.
+> - `volatile`: The result may change on each call, similar to `random()`. SQL cache, materialized view rewrite, and many optimizer rewrite rules are disabled for this type of function.
## Access Control Requirements
@@ -91,7 +95,8 @@ To execute this command, the user must have `ADMIN_PRIV` privileges.
"file"="file:///path/to/java-udf-demo-jar-with-dependencies.jar",
"symbol"="org.apache.doris.udf.AddOne",
"always_nullable"="true",
- "type"="JAVA_UDF"
+ "type"="JAVA_UDF",
+ "volatility"="immutable"
);
```
@@ -135,4 +140,92 @@ To execute this command, the user must have `ADMIN_PRIV` privileges.
```sql
CREATE GLOBAL ALIAS FUNCTION id_masking(INT) WITH PARAMETER(id) AS CONCAT(LEFT(id, 3), '****', RIGHT(id, 4));
- ```
\ No newline at end of file
+ ```
+
+6. Create a volatile Python UDF. Functions such as `uuid.uuid4()` that depend on randomness should keep the default `volatility = volatile` and must not be incorrectly marked as `immutable`.
+
+ ```sql
+ CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1 PROPERTIES ("replication_num" = "1");
+ INSERT INTO cte_uuid_seed VALUES (1),(2),(3);
+
+ DROP FUNCTION IF EXISTS py_uuid_token(INT);
+ CREATE FUNCTION py_uuid_token(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_uuid_token_impl",
+ "always_nullable" = "false",
+ "runtime_version" = "3.12.11",
+ "volatility" = "volatile"
+ )
+ AS $$
+import uuid
+def py_uuid_token_impl(x):
+ return f"{x}-{uuid.uuid4()}"
+$$;
+
+ SET enable_cte_materialize = true;
+ SET inline_cte_referenced_threshold = 10;
+
+ WITH cte AS (SELECT id, py_uuid_token(id) AS token FROM cte_uuid_seed)
+ SELECT id, COUNT(DISTINCT token) AS distinct_tokens
+ FROM (SELECT id, token FROM cte UNION ALL SELECT id, token FROM cte) u
+ GROUP BY id ORDER BY id;
+ ```
+
+ Correct result:
+
+ ```text
+ +------+-----------------+
+ | id | distinct_tokens |
+ +------+-----------------+
+ | 1 | 1 |
+ | 2 | 1 |
+ | 3 | 1 |
+ +------+-----------------+
+ ```
+
+ For this function, the following definition is incorrect:
+
+ ```sql
+ DROP FUNCTION IF EXISTS py_uuid_token(INT);
+ CREATE FUNCTION py_uuid_token(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_uuid_token_impl",
+ "always_nullable" = "false",
+ "runtime_version" = "3.12.11",
+ "volatility" = "immutable"
+ )
+ AS $$
+import uuid
+def py_uuid_token_impl(x):
+ return f"{x}-{uuid.uuid4()}"
+$$;
+ ```
+
+ Run the same query again:
+
+ ```sql
+ WITH cte AS (SELECT id, py_uuid_token(id) AS token FROM cte_uuid_seed)
+ SELECT id, COUNT(DISTINCT token) AS distinct_tokens
+ FROM (SELECT id, token FROM cte UNION ALL SELECT id, token FROM cte) u
+ GROUP BY id ORDER BY id;
+ ```
+
+ Incorrect result:
+
+ ```text
+ +------+-----------------+
+ | id | distinct_tokens |
+ +------+-----------------+
+ | 1 | 2 |
+ | 2 | 2 |
+ | 3 | 2 |
+ +------+-----------------+
+ ```
+
+ Why this is wrong:
+ Because `py_uuid_token` is volatile, each call to `uuid.uuid4()` generates a new value. If the function is incorrectly marked as `volatility = immutable`, the optimizer may treat repeated references as safe to rewrite and may choose a plan that evaluates the UDF separately on both sides of `UNION ALL`. As a result, the same `id` can produce two different `token` values, and `COUNT(DISTINCT token)` changes from `1` to `2`.
\ No newline at end of file