diff --git a/.bumpversion.toml b/.bumpversion.toml index e50d6af..3aa0317 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -12,7 +12,7 @@ # limitations under the License. [tool.bumpversion] -current_version = "2.0.0" +current_version = "2.1.0-rc.1" commit = true message = "Update version {current_version} -> {new_version}" ignore_missing_version = true diff --git a/CHANGELOG.md b/CHANGELOG.md index 2aff2e4..c4f3d17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# [2.1.0-rc.1](https://github.com/IBM/data-intelligence-sdk/compare/v2.0.0...v2.1.0-rc.1) (2026-05-20) + + +### Features + +* **dq:** Sync from enterprise cf57469 on 2026-05-20 ([8f0474a](https://github.com/IBM/data-intelligence-sdk/commit/8f0474ac4c8890bc1171367f616a32f7c900fdbb)) + # [2.0.0](https://github.com/IBM/data-intelligence-sdk/compare/v1.0.0...v2.0.0) (2026-04-23) diff --git a/README.md b/README.md index 9040d1a..48566a6 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. --> -# IBM watsonx.data intelligence SDK Version 2.0.0 +# IBM watsonx.data intelligence SDK Version 2.1.0-rc.1 A comprehensive Python SDK for data intelligence operations including: - **Data Quality Validation**: Validate streaming data records, Pandas DataFrames, and PySpark DataFrames @@ -295,7 +295,7 @@ container_response = dph_service.initialize( # Create a data product data_product = dph_service.create_data_product( drafts=[{ - 'version': '2.0.0', + 'version': '2.1.0-rc.1', 'name': 'My Data Product', 'description': 'A sample data product', 'asset': { @@ -1186,5 +1186,5 @@ For issues, questions, or contributions, please open an issue on GitHub. 
- pytest-cov >= 4.0.0 - pytest-mock >= 3.7.0 - black >= 26.3.1 -- mypy >= 2.0.0 +- mypy >= 1.0.0 diff --git a/docs/README.md b/docs/README.md index 3fe648e..f871867 100644 --- a/docs/README.md +++ b/docs/README.md @@ -26,6 +26,7 @@ docs/ ├── requirements.txt # Documentation dependencies ├── build_docs.py # Build script ├── README.md # This file +├── CONTRIBUTING_TO_DOCS.md # Documentation contribution guide │ ├── _static/ # Static assets │ ├── css/ @@ -39,11 +40,17 @@ docs/ │ ├── 02_overview/ # Features and release notes │ ├── 03_common_modules/ # Shared authentication │ ├── 04_dq_validator/ # DQ Validator module -│ └── 05_future_modules/ # Future module guidelines +│ ├── 05_dph_services/ # Data Product Hub Services +│ ├── 06_odcs_generator/ # ODCS Generator +│ ├── 07_data_product_recommender/ # Data Product Recommender +│ └── 08_future_modules/ # Future module guidelines │ └── api/ # API reference ├── common/ # Common modules API - └── dq_validator/ # DQ Validator API + ├── dq_validator/ # DQ Validator API + ├── dph_services/ # DPH Services API + ├── odcs_generator/ # ODCS Generator Class Reference + └── data_product_recommender/ # Data Product Recommender Class Reference ``` ## Building Documentation Locally diff --git a/docs/chapters/01_welcome/index.rst b/docs/chapters/01_welcome/index.rst index 1700b6d..b800c8e 100644 --- a/docs/chapters/01_welcome/index.rst +++ b/docs/chapters/01_welcome/index.rst @@ -24,7 +24,27 @@ If you're new to the SDK or installing it for the first time, be sure to check o You can find details on the latest releases, FAQs, known issues, and more in the :ref:`Overview` section. -If you already have the SDK installed and are looking to get started using it, please refer to the :ref:`Common Modules` section for authentication setup, and the :ref:`DQ Validator` section for data quality validation. 
+SDK Modules +----------- + +The SDK provides several powerful modules for data intelligence and governance: + +**Data Quality Validator** + Comprehensive data quality validation framework with support for multiple check types, integration with Pandas and PySpark DataFrames, and CEL (Common Expression Language) for complex validation rules. See :ref:`DQ Validator` for details. + +**Data Product Hub Services** + Python client library for IBM Data Product Hub API, enabling programmatic management of data products, containers, contract terms, and the complete data product lifecycle from creation to retirement. See :ref:`Data Product Hub Services` for details. + +**ODCS Generator** + Automated generation of Open Data Contract Standard (ODCS) v3.1.0 compliant YAML files from enterprise data catalogs including Collibra and Informatica CDGC. Streamlines data contract creation by extracting and transforming catalog metadata. See :ref:`ODCS Generator` for details. + +**Data Product Recommender** + Intelligent analysis of database query logs to identify high-value tables and logical groupings for data product prioritization. Supports multiple platforms (Snowflake, Databricks, BigQuery, watsonx.data) and provides actionable recommendations based on usage patterns. See :ref:`Data Product Recommender` for details. + +Getting Started +--------------- + +If you already have the SDK installed and are looking to get started using it, please refer to the :ref:`Common Modules` section for authentication setup, and explore the individual module documentation for specific use cases. Looking for documentation on the SDK's interfaces and abstractions? Please check out our :ref:`API Reference Documentation` for an in-depth breakdown of all the SDK's classes, properties, and methods - including detailed descriptions of any required or optional parameters. 
diff --git a/docs/chapters/01_welcome/installation.rst b/docs/chapters/01_welcome/installation.rst index 0ca04d3..009017c 100644 --- a/docs/chapters/01_welcome/installation.rst +++ b/docs/chapters/01_welcome/installation.rst @@ -100,7 +100,7 @@ To verify that the SDK is installed correctly: >>> import wxdi.dq_validator >>> from wxdi.common.auth import AuthProvider >>> print(dq_validator.__version__) - 2.0.0 + 2.1.0-rc.1 Versioning ---------- @@ -116,7 +116,7 @@ Version numbers follow the format ``MAJOR.MINOR.PATCH``: Current Version ~~~~~~~~~~~~~~~ -The current version of the SDK is **2.0.0**. +The current version of the SDK is **2.1.0-rc.1**. Checking Your Version ~~~~~~~~~~~~~~~~~~~~~ @@ -133,7 +133,7 @@ Or programmatically: >>> import wxdi.dq_validator >>> print(dq_validator.__version__) - 2.0.0 + 2.1.0-rc.1 Upgrading --------- diff --git a/docs/chapters/02_overview/features.rst b/docs/chapters/02_overview/features.rst index 17e5cef..0b6fdc5 100644 --- a/docs/chapters/02_overview/features.rst +++ b/docs/chapters/02_overview/features.rst @@ -157,6 +157,136 @@ Type Safety * IDE autocomplete and type checking support * Runtime type validation +DPH Services Module +------------------- + +Python client library for IBM Data Product Hub API, providing programmatic access to data product management. 
+ +Container Management +~~~~~~~~~~~~~~~~~~~~ + +* Initialize and configure data product containers +* Manage delivery methods and domain structures +* Service credential management +* API key operations + +Data Product Lifecycle +~~~~~~~~~~~~~~~~~~~~~~ + +* Create, update, and delete data products +* Draft management with version control +* Publish drafts to releases +* Retire releases when needed +* Pagination support for large datasets + +Contract Terms +~~~~~~~~~~~~~~ + +* Manage contract terms and documents +* Create reusable contract templates +* Attach terms and conditions to data products +* Service level agreement management + +Domain Organization +~~~~~~~~~~~~~~~~~~~ + +* Create and manage domains and subdomains +* Organize data products by business area +* Multi-industry domain support +* Hierarchical domain structures + +Asset Visualization +~~~~~~~~~~~~~~~~~~~ + +* Create data asset visualizations +* Reinitiate visualizations with updated assets +* Support for multiple assets per visualization + +ODCS Generator Module +--------------------- + +Automated generation of Open Data Contract Standard (ODCS) v3.1.0 compliant YAML files from data catalog metadata. 
+ +Multi-Catalog Support +~~~~~~~~~~~~~~~~~~~~~ + +* **Collibra Integration**: Extract metadata from Collibra data catalog +* **Informatica CDGC**: Extract metadata from Informatica Cloud Data Governance and Catalog +* Extensible architecture for additional catalog sources + +Metadata Extraction +~~~~~~~~~~~~~~~~~~~ + +* Automatic asset metadata extraction via REST APIs +* Column discovery through catalog relations +* Data type mapping (logical and physical) +* Classification support via GraphQL (Collibra) +* Tag integration at asset and column levels +* Custom attribute preservation + +ODCS Generation +~~~~~~~~~~~~~~~ + +* ODCS v3.1.0 compliant YAML output +* Complete schema definition with column metadata +* Data quality rules integration +* Service level agreement specifications +* Governance and ownership information + +Data Type Mapping +~~~~~~~~~~~~~~~~~ + +* Intelligent mapping of catalog types to ODCS types +* Support for logical types (string, integer, number, timestamp, boolean) +* Physical type preservation with precision and scale +* Custom type mapping support + +Data Product Recommender Module +-------------------------------- + +Analyze database query logs to identify high-value tables and logical groupings for data product prioritization. 
+ +Multi-Platform Support +~~~~~~~~~~~~~~~~~~~~~~ + +* **Snowflake**: Query log analysis from ACCOUNT_USAGE.QUERY_HISTORY +* **Databricks**: Query log analysis from system.query.history +* **BigQuery**: Query log analysis from INFORMATION_SCHEMA.JOBS_BY_PROJECT +* **watsonx.data**: Query log analysis from system.runtime.queries + +Intelligent Scoring +~~~~~~~~~~~~~~~~~~~ + +* Query frequency analysis (37.5% weight) +* User diversity metrics (37.5% weight) +* Recency scoring (15% weight) +* Consistency patterns (10% weight) +* Customizable scoring weights + +Table Grouping +~~~~~~~~~~~~~~ + +* Identify tables frequently used together +* Cohesion analysis for logical groupings +* User reach metrics across groups +* Group scoring with multiple factors + +Output Formats +~~~~~~~~~~~~~~ + +* **Markdown**: Human-readable reports with tables and formatting +* **JSON**: Machine-readable format for automation and AI agents +* Star ratings (1-5 stars) for quick assessment +* Detailed metrics and query pattern analysis + +CLI and Python API +~~~~~~~~~~~~~~~~~~ + +* Command-line interface for quick analysis +* Python API for programmatic integration +* File-based input (CSV and JSON) +* Configurable output directory and format + Future Modules -------------- diff --git a/docs/chapters/04_dq_validator/cel_expressions.rst b/docs/chapters/04_dq_validator/cel_expressions.rst new file mode 100644 index 0000000..33fd04d --- /dev/null +++ b/docs/chapters/04_dq_validator/cel_expressions.rst @@ -0,0 +1,888 @@ +CEL Expression Validation +========================== + +Overview +-------- + +The Data Intelligence SDK supports **CEL (Common Expression Language)** for defining custom validation rules. CEL is a non-Turing complete expression language developed by Google that provides a safe, fast way to evaluate expressions without the security risks of arbitrary code execution. 
+ +CEL expressions allow you to: + +- Define complex validation logic without writing Python code +- Reference multiple columns in a single validation rule +- Use conditional logic (ternary operators) for context-dependent validation +- Perform string operations, arithmetic, and logical comparisons +- Validate data against business rules that span multiple fields + +.. warning:: + **Column Names are CASE-SENSITIVE** + + CEL expressions use exact string matching for column names. ``birth_date`` and ``Birth_date`` are different columns. + ``firstName`` and ``First_Name`` are different columns. Always use the exact column name as defined in your metadata. + + **Examples:** + + - ✅ Correct: ``birth_date != null`` (matches metadata column ``birth_date``) + - ❌ Wrong: ``Birth_date != null`` (case mismatch) + - ❌ Wrong: ``BIRTH_DATE != null`` (case mismatch) + - ❌ Wrong: ``birthDate != null`` (different name) + +Installation +------------ + +CEL support requires the ``cel-python`` package: + +.. code-block:: bash + + pip install "cel-python>=0.5.0" + +Or install the full SDK which includes CEL support: + +.. code-block:: bash + + pip install data-intelligence-sdk + +Complete Examples +----------------- + +For complete working examples, see: + +- ``examples/cel_usage.py`` - CEL expressions with batch validation +- ``examples/cel_pandas_dataframe_usage.py`` - CEL expressions with Pandas DataFrames + +- ``examples/table_cel_usage.py`` - Table-level CEL expressions for cross-column validation + +CEL Validation Types +-------------------- + +The SDK supports two types of CEL validation: + +**Column-Level CEL (CELCheck)** + Validates individual column values. Has access to the ``value`` variable representing the current column being validated. + + Use for: Single-column validation, value range checks, format validation. + +**Table-Level CEL (TableCELCheck)** + Validates entire records for cross-column business logic. 
Does NOT have a ``value`` variable since it validates the whole record. + + Use for: Cross-column validation, multi-field business rules, date consistency checks. + +.. code-block:: python + + from wxdi.dq_validator import ( + Validator, ValidationRule, TableValidationRule, + CELCheck, TableCELCheck + ) + + validator = Validator(metadata) + + # Column-level: Validates 'salary' column + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck('value > 0')) + ) + + # Table-level: Validates entire record + validator.add_table_rule( + TableValidationRule('salary_check') + .add_check(TableCELCheck('salary > min_salary && age >= 18')) + ) + + +Basic Usage +----------- + +Simple Value Validation +~~~~~~~~~~~~~~~~~~~~~~~ + +The most basic CEL expression validates a single value: + +.. code-block:: python + + from wxdi.dq_validator import Validator, ValidationRule, CELCheck + + # Create validator + validator = Validator(metadata) + + # Add CEL check for positive values + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='value > 0', + error_message='Salary must be positive' + )) + ) + +Multi-Column Validation +~~~~~~~~~~~~~~~~~~~~~~~ + +CEL expressions can reference other columns in the same record directly by column name: + +.. code-block:: python + + # Salary must exceed minimum salary (SIMPLE SYNTAX - RECOMMENDED) + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='value > min_salary', + error_message='Salary below minimum threshold' + )) + ) + + # Alternative: Explicit syntax with 'record.' prefix (also supported) + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='value > record.min_salary', + error_message='Salary below minimum threshold' + )) + ) + +.. note:: + **Both syntaxes work identically!** The simple syntax (``min_salary``) is recommended for better readability, + especially for clients who may not be familiar with CEL. 
The explicit syntax (``record.min_salary``) is still + supported for advanced users who prefer namespace clarity. + +Available Variables +------------------- + +CEL expressions have access to the following variables: + +.. list-table:: + :header-rows: 1 + :widths: 20 80 + + * - Variable + - Description + * - ``value`` + - The current column value being validated + * - ``<column>`` (any column name) + - **Direct column access**: Reference any column by name (e.g., ``min_salary``, ``age``, ``department``) + * - ``record`` + - Dictionary-like object for explicit access (e.g., ``record.min_salary``) - optional, use for clarity + * - ``column_name`` + - Name of the column being validated (string) + * - ``record_index`` + - Position of the record in the batch (integer, 0-based) + +**Syntax Options:** + +You can reference other columns in two ways: + +1. **Simple Syntax (Recommended):** ``min_salary``, ``age``, ``department`` + + - More intuitive for clients + - Cleaner, easier to read + - No namespace prefix needed + +2. **Explicit Syntax (Optional):** ``record.min_salary``, ``record.age``, ``record.department`` + + - Provides namespace clarity + - Useful when you want to be explicit + - Required for columns with reserved names (see below) + +.. warning:: + **Reserved Column Names:** If your data has columns named ``value``, ``column_name``, ``record_index``, or ``record``, + you **must** use the explicit syntax (``record.value``) to access them. The simple syntax won't work for these + reserved names to avoid conflicts with CEL's built-in variables. + + Example: If you have a column named "value", use ``record.value`` instead of just ``value``. + +Supported Operators +------------------- + +Comparison Operators +~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + # Equal to + CELCheck('value == 100') + + # Not equal to + CELCheck('value != 0') + + # Greater than + CELCheck('value > 50') + + # Greater than or equal + CELCheck('value >= 50') + + # Less than + CELCheck('value < 100') + + # Less than or equal + CELCheck('value <= 100') + +Logical Operators +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # AND operator + CELCheck('value > 0 && value < 100') + + # OR operator + CELCheck('value < 0 || value > 100') + + # NOT operator + CELCheck('!(value == 0)') + +Arithmetic Operators +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Addition + CELCheck('value == record.base_salary + record.bonus') + + # Subtraction + CELCheck('value == record.total - record.deductions') + + # Multiplication + CELCheck('value == record.price * record.quantity') + + # Division + CELCheck('value == record.total / record.count') + + # Modulo + CELCheck('value % 10 == 0') # Must be multiple of 10 + +String Operations +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Starts with + CELCheck('value.startsWith("admin_")') + + # Ends with + CELCheck('value.endsWith("@company.com")') + + # Contains (using 'in' operator) + CELCheck('"@" in value') + +List Operations +~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Value in list + CELCheck('value in ["Active", "Pending", "Approved"]') + + # Value not in list + CELCheck('!(value in ["Deleted", "Archived"])') + +Conditional Logic +----------------- + +Ternary Operator +~~~~~~~~~~~~~~~~ + +CEL supports ternary (conditional) expressions using the ``? :`` syntax: + +.. code-block:: python + + # Age-based salary requirements + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='record.age > 40 ? 
value >= 80000 : value >= 50000', + error_message='Salary does not meet age-based requirements' + )) + ) + +This expression reads as: "If age > 40, then salary must be >= 80000, otherwise salary must be >= 50000" + +Complex Conditions +~~~~~~~~~~~~~~~~~~ + +You can nest conditions and combine them with logical operators: + +.. code-block:: python + + # Department-based bonus limits + validator.add_rule( + ValidationRule('bonus') + .add_check(CELCheck( + expression='record.department == "Sales" ? value <= 20000 : value <= 10000', + error_message='Bonus exceeds department limit' + )) + ) + +Advanced Examples +----------------- + +Range Validation +~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Value must be between two columns + CELCheck('value >= record.min_value && value <= record.max_value') + +Business Rule Validation +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Sales employees must be at least 21 + validator.add_rule( + ValidationRule('age') + .add_check(CELCheck( + expression='value >= 21 || record.department != "Sales"', + error_message='Sales employees must be at least 21 years old' + )) + ) + +Email Domain Validation +~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Email must be from company domain + validator.add_rule( + ValidationRule('email') + .add_check(CELCheck( + expression='value.endsWith("@company.com")', + error_message='Email must be from company domain' + )) + ) + +Status Validation +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Status must be one of allowed values + validator.add_rule( + ValidationRule('status') + .add_check(CELCheck( + expression='value in ["Active", "Pending", "Approved"]', + error_message='Invalid status value' + )) + ) + +Combining with Other Checks +---------------------------- + +CEL checks can be combined with other validation checks: + +.. 
code-block:: python + + from wxdi.dq_validator import CompletenessCheck, RangeCheck + + validator.add_rule( + ValidationRule('salary') + .add_check(CompletenessCheck()) # Must not be null + .add_check(RangeCheck(min_value=0, max_value=1000000)) # Range check + .add_check(CELCheck('value > record.min_salary')) # CEL check + ) + +Error Handling +-------------- + +Compilation Errors +~~~~~~~~~~~~~~~~~~ + +CEL expressions are compiled at initialization. If an expression has syntax errors, a ``CELCompilationError`` is raised immediately: + +.. code-block:: python + + from wxdi.dq_validator.cel_exceptions import CELCompilationError + + try: + check = CELCheck('value >') # Incomplete expression + except CELCompilationError as e: + print(f"Invalid CEL expression: {e}") + +Runtime Errors +~~~~~~~~~~~~~~ + +If an error occurs during evaluation (e.g., type mismatch, null reference), the check returns a ``ValidationError`` rather than raising an exception: + +.. code-block:: python + + # This will handle null values gracefully + check = CELCheck('value != null') + + # Validation will return an error if evaluation fails + error = check.validate(None, context) + if error: + print(error.message) + +Best Practices +-------------- + +1. **Keep Expressions Simple** + + - Prefer simple, readable expressions over complex nested logic + - Break complex rules into multiple checks when possible + +2. **Use Descriptive Error Messages** + + .. code-block:: python + + CELCheck( + expression='value > 0', + error_message='Salary must be a positive number' + ) + +3. **Test Expressions with Sample Data** + + - Verify expressions work with your actual data before deployment + - Test edge cases (null values, boundary conditions) + +4. **Consider Performance** + + - CEL expressions are compiled once and reused + - Evaluation is very fast (~10-100 microseconds per record) + - Suitable for high-throughput validation + +5. **Document Complex Logic** + + .. 
code-block:: python + + # Senior employees (age > 40) must earn at least $80,000 + # Junior employees must earn at least $50,000 + CELCheck( + expression='record.age > 40 ? value >= 80000 : value >= 50000', + description='Age-based salary requirements' + ) + +Integration with DataFrames +---------------------------- + +CEL checks work seamlessly with both Pandas and Spark DataFrames: + +Pandas Integration +~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from wxdi.dq_validator import PandasValidator + import pandas as pd + + # Create DataFrame + df = pd.DataFrame({ + 'emp_id': [1001, 1002], + 'salary': [75000, 85000], + 'min_salary': [60000, 70000] + }) + + # Validate with CEL + validator = PandasValidator(metadata) + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck('value > record.min_salary')) + ) + + results = validator.validate(df) + +Spark Integration +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from wxdi.dq_validator import SparkValidator + from pyspark.sql import SparkSession + + # Create Spark DataFrame + spark = SparkSession.builder.getOrCreate() + df = spark.createDataFrame([ + (1001, 75000, 60000), + (1002, 85000, 70000) + ], ['emp_id', 'salary', 'min_salary']) + + # Validate with CEL + validator = SparkValidator(metadata) + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck('value > record.min_salary')) + ) + + results = validator.validate(df) + +Limitations +----------- + +1. **Non-Turing Complete** + + - CEL does not support loops or recursion + - Cannot define custom functions + - This is by design for security and performance + +2. **Expression Length** + + - Maximum expression length: 1000 characters + - This prevents abuse and ensures reasonable performance + +3. **Type Safety** + + - CEL expressions must return boolean values + - Type mismatches are caught at runtime and reported as validation errors + +4. 
**No Side Effects** + + - CEL expressions cannot modify data + - They can only read values and return boolean results + +API Reference +------------- + +CELCheck Class +~~~~~~~~~~~~~~ + +.. code-block:: python + + class CELCheck(BaseCheck): + def __init__( + self, + expression: str, + error_message: Optional[str] = None, + dimension: DataQualityDimension = DataQualityDimension.VALIDITY, + description: Optional[str] = None + ) + +**Parameters:** + +- ``expression`` (str): CEL expression that must evaluate to boolean +- ``error_message`` (str, optional): Custom error message for validation failures +- ``dimension`` (DataQualityDimension, optional): Data quality dimension (default: VALIDITY) +- ``description`` (str, optional): Human-readable description of the check + +**Methods:** + +- ``validate(value, context)``: Validate a value using the CEL expression +- ``get_expression()``: Get the CEL expression string +- ``get_description()``: Get the check description + +CEL Exceptions +~~~~~~~~~~~~~~ + +.. code-block:: python + + from wxdi.dq_validator.cel_exceptions import ( + CELError, # Base exception + CELCompilationError, # Syntax errors at initialization + CELEvaluationError # Runtime errors during evaluation + ) + +Table-Level CEL Validation +--------------------------- + +Table-level CEL validation enables cross-column business rules and complex validation logic that spans multiple fields. + +Overview +~~~~~~~~ + +Unlike column-level CEL (``CELCheck``) which validates individual column values, table-level CEL (``TableCELCheck``) validates entire records. This is essential for: + +- Cross-column validation (e.g., ``start_date < end_date``) +- Complex business rules spanning multiple fields +- Conditional logic based on multiple columns +- Record-level consistency checks + +Key Differences +~~~~~~~~~~~~~~~ + +.. 
list-table:: + :header-rows: 1 + :widths: 50 50 + + * - Column-Level CEL (CELCheck) + - Table-Level CEL (TableCELCheck) + * - Validates single column value + - Validates entire record + * - Has ``value`` variable + - NO ``value`` variable + * - Use: ``ValidationRule('column')`` + - Use: ``TableValidationRule('rule_name')`` + * - Example: ``value > 0`` + - Example: ``salary > min_salary`` + +Basic Table-Level Validation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from wxdi.dq_validator import ( + Validator, TableValidationRule, TableCELCheck + ) + + validator = Validator(metadata) + + # Simple cross-column comparison + validator.add_table_rule( + TableValidationRule('salary_check') + .add_check(TableCELCheck( + 'salary > min_salary', + error_message='Salary must exceed minimum' + )) + ) + +Complex Business Rules +~~~~~~~~~~~~~~~~~~~~~~ + +Table-level CEL excels at complex, multi-field business logic: + +.. code-block:: python + + # Age-based salary requirements + validator.add_table_rule( + TableValidationRule('age_salary_check') + .add_check(TableCELCheck( + 'age > 40 ? salary >= 80000 : salary >= 50000', + error_message='Salary does not meet age-based requirements' + )) + ) + + # Department-specific rules + validator.add_table_rule( + TableValidationRule('dept_rules') + .add_check(TableCELCheck( + 'department == "Sales" ? (salary >= 50000 && age >= 21) : salary >= 40000', + error_message='Department requirements not met' + )) + ) + + # Date consistency + validator.add_table_rule( + TableValidationRule('date_check') + .add_check(TableCELCheck( + 'start_date < end_date', + error_message='Start date must be before end date' + )) + ) + +Multiple Table Rules +~~~~~~~~~~~~~~~~~~~~ + +You can combine multiple table-level rules for comprehensive validation: + +.. 
code-block:: python + + validator = Validator(metadata) + + # Rule 1: Salary validation + validator.add_table_rule( + TableValidationRule('salary_check') + .add_check(TableCELCheck('salary > min_salary')) + ) + + # Rule 2: Age validation + validator.add_table_rule( + TableValidationRule('age_check') + .add_check(TableCELCheck('age >= 18 && age <= 65')) + ) + + # Rule 3: Bonus limits + validator.add_table_rule( + TableValidationRule('bonus_check') + .add_check(TableCELCheck('bonus <= salary * 0.3')) + ) + +Combining Column and Table Rules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For comprehensive validation, combine both column-level and table-level rules: + +.. code-block:: python + + validator = Validator(metadata) + + # Column-level: Individual field validation + validator.add_rule( + ValidationRule('email') + .add_check(CompletenessCheck()) + .add_check(FormatCheck('email')) + ) + + validator.add_rule( + ValidationRule('age') + .add_check(RangeCheck(min_value=0, max_value=120)) + ) + + # Table-level: Cross-field business rules + validator.add_table_rule( + TableValidationRule('business_rules') + .add_check(TableCELCheck( + 'salary > min_salary && age >= 18', + error_message='Invalid salary/age combination' + )) + ) + +Available Variables +~~~~~~~~~~~~~~~~~~~ + +Table-level CEL expressions have access to: + +- **Column names**: Direct access to any column (e.g., ``salary``, ``age``, ``department``) +- **record**: Dictionary of all column values (e.g., ``record.salary``, ``record.age``) +- **record_index**: Position of the record in the batch + +**Note:** Unlike column-level CEL, there is NO ``value`` or ``column_name`` variable. + +Performance Optimization +~~~~~~~~~~~~~~~~~~~~~~~~ + +Table-level CEL automatically optimizes for wide tables by extracting only required columns from the expression: + +.. 
code-block:: python + + # Expression: 'salary > min_salary && age >= 18' + # Only adds: salary, min_salary, age to context + # Not all 100+ columns + + check = TableCELCheck('salary > min_salary && age >= 18') + # check._required_columns = {'salary', 'min_salary', 'age'} + +This optimization is critical for assets with many columns (100+) to reduce memory usage and improve performance. + +Column Reference Validation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Validate column references before runtime: + +.. code-block:: python + + check = TableCELCheck('salary > max_salary') # max_salary doesn't exist + + # Validate against metadata + try: + check.validate_column_references([c.name for c in metadata.columns]) + except ValueError as e: + print(e) + # CEL expression references non-existent column(s): + # - 'max_salary' not found + # + # ⚠️ Column names are CASE-SENSITIVE. + # Available columns: 'salary', 'min_salary', 'age', ... + +Best Practices +~~~~~~~~~~~~~~ + +1. **Use table-level for cross-column validation**: When validation depends on multiple fields +2. **Use column-level for single-field checks**: When validating individual column values +3. **Combine both approaches**: For comprehensive validation coverage +4. **Keep expressions readable**: Break complex logic into multiple rules +5. **Use descriptive rule names**: For better error tracking and debugging +6. **Validate column references**: Call ``validate_column_references()`` after initialization + +Complete Table-Level Example +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See ``examples/table_cel_usage.py`` for a complete working example demonstrating: + +- Multi-column comparisons +- Complex business rules +- Department-specific validation +- Date consistency checks +- Combining column and table rules + + +Complete Example +---------------- + +Here's a complete example demonstrating various CEL features: + +.. 
code-block:: python + + from wxdi.dq_validator import ( + Validator, ValidationRule, CELCheck, + AssetMetadata, ColumnMetadata, DataType + ) + + # Define metadata + metadata = AssetMetadata( + table_name='employees', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('name', DataType.STRING), + ColumnMetadata('email', DataType.STRING), + ColumnMetadata('age', DataType.INTEGER), + ColumnMetadata('department', DataType.STRING), + ColumnMetadata('salary', DataType.DECIMAL), + ColumnMetadata('min_salary', DataType.DECIMAL), + ColumnMetadata('bonus', DataType.DECIMAL), + ColumnMetadata('status', DataType.STRING) + ] + ) + + # Create validator with CEL checks + validator = Validator(metadata) + + # Simple value validation + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck('value > 0')) + ) + + # Multi-column comparison (SIMPLE SYNTAX) + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck('value > min_salary')) + ) + + # Conditional logic (SIMPLE SYNTAX) + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + 'age > 40 ? value >= 80000 : value >= 50000', + error_message='Salary does not meet age-based requirements' + )) + ) + + # String validation + validator.add_rule( + ValidationRule('email') + .add_check(CELCheck( + 'value.endsWith("@company.com")', + error_message='Email must be from company domain' + )) + ) + + # List membership + validator.add_rule( + ValidationRule('status') + .add_check(CELCheck( + 'value in ["Active", "Pending", "Approved"]', + error_message='Invalid status value' + )) + ) + + # Department-based rules (SIMPLE SYNTAX) + validator.add_rule( + ValidationRule('bonus') + .add_check(CELCheck( + 'department == "Sales" ? 
value <= 20000 : value <= 10000', + error_message='Bonus exceeds department limit' + )) + ) + + # Validate records + records = [ + [1001, 'John Doe', 'john@company.com', 30, 'Engineering', 75000, 60000, 5000, 'Active'], + [1002, 'Jane Smith', 'jane@company.com', 45, 'Sales', 85000, 70000, 18000, 'Active'] + ] + + results = validator.validate_batch(records) + + # Process results + for idx, result in enumerate(results): + if result.is_valid: + print(f"Record {idx + 1}: PASS") + else: + print(f"Record {idx + 1}: FAIL") + for error in result.errors: + print(f" - {error.column_name}: {error.message}") + +See Also +-------- + +- :doc:`validation_checks` - Overview of all validation check types +- :doc:`core_concepts` - Core concepts of the DQ Validator +- :doc:`examples` - More examples and use cases +- `CEL Specification <https://github.com/google/cel-spec>`_ - Official CEL language specification + +.. Made with Bob diff --git a/docs/chapters/04_dq_validator/examples.rst b/docs/chapters/04_dq_validator/examples.rst index d507e04..b15bfc9 100644 --- a/docs/chapters/04_dq_validator/examples.rst +++ b/docs/chapters/04_dq_validator/examples.rst @@ -28,6 +28,13 @@ Basic Validation See ``examples/basic_usage.py`` for array-based record validation. 
+CEL Expression Validation +-------------------------- + +* ``examples/cel_usage.py`` - Column-level CEL expressions with batch validation +* ``examples/table_cel_usage.py`` - Table-level CEL expressions for cross-column validation +* ``examples/cel_pandas_dataframe_usage.py`` - CEL expressions with Pandas DataFrames + DataFrame Validation -------------------- diff --git a/docs/chapters/04_dq_validator/index.rst b/docs/chapters/04_dq_validator/index.rst index ba2ad5b..1a43a33 100644 --- a/docs/chapters/04_dq_validator/index.rst +++ b/docs/chapters/04_dq_validator/index.rst @@ -33,8 +33,8 @@ Key Capabilities **Validation Engine** Core validation framework with metadata-driven rules and fluent API -**Nine Check Types** - Comprehensive validation coverage including length, format, datatype, range, regex, and more +**Ten Check Types** + Comprehensive validation coverage including length, format, datatype, range, regex, CEL expressions, and more **Data Quality Dimensions** Track validations across 8 standard DQ dimensions (Accuracy, Completeness, Conformity, etc.) @@ -51,6 +51,7 @@ Key Capabilities core_concepts validation_checks + cel_expressions dataframe_integration rest_api_integration examples diff --git a/docs/chapters/04_dq_validator/validation_checks.rst b/docs/chapters/04_dq_validator/validation_checks.rst index 73eeae7..aabd738 100644 --- a/docs/chapters/04_dq_validator/validation_checks.rst +++ b/docs/chapters/04_dq_validator/validation_checks.rst @@ -18,7 +18,7 @@ Validation Checks ================= -The DQ Validator module provides nine comprehensive validation check types. +The DQ Validator module provides ten comprehensive validation check types. .. note:: This section provides an overview. Detailed API documentation with all parameters is available in the :ref:`API Reference`. @@ -35,6 +35,35 @@ Available Checks 7. **RegexCheck** - Validates regex patterns 8. **FormatCheck** - Validates value formats 9. **DataTypeCheck** - Validates data types +10. 
**CELCheck** - Validates using CEL (Common Expression Language) expressions + +CEL Expression Check +-------------------- + +The **CELCheck** enables custom validation logic using Google's Common Expression Language (CEL). This powerful check type allows you to: + +- Define complex validation rules without writing Python code +- Reference multiple columns in a single expression +- Use conditional logic for context-dependent validation +- Perform string operations, arithmetic, and logical comparisons + +Quick Example +~~~~~~~~~~~~~ + +.. code-block:: python + + from wxdi.dq_validator import CELCheck + + # Simple value check + check = CELCheck('value > 0') + + # Multi-column comparison + check = CELCheck('value > record.min_salary') + + # Conditional logic + check = CELCheck('record.age > 40 ? value >= 80000 : value >= 50000') + +For comprehensive documentation on CEL expressions, see :doc:`cel_expressions`. For detailed usage examples and API documentation, see the :ref:`API Reference`. diff --git a/docs/conf.py b/docs/conf.py index 917b9ea..03f8a43 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -38,9 +38,9 @@ # the built documents. # # The short X.Y version. -version = "2.0.0" +version = "2.1.0-rc.1" # The full version, including alpha/beta/rc tags. -release = "2.0.0" +release = "2.1.0-rc.1" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/requirements.txt b/docs/requirements.txt index fe3a9ed..aa2b892 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -12,9 +12,11 @@ # limitations under the License. 
# Documentation build dependencies for IBM watsonx.data intelligence SDK -sphinx>=7.0.0 +# Note: requests vulnerability CVE-2026-25645 is addressed via sphinx dependencies +sphinx>=7.4.0 sphinx-book-theme>=1.0.0 sphinx-autodoc-typehints>=1.24.0 sphinx-copybutton>=0.5.0 sphinx-favicon>=1.0.0 + # sphinxcontrib-autodoc-pydantic - Optional, for enhanced Pydantic model documentation \ No newline at end of file diff --git a/examples/cel_pandas_dataframe_usage.py b/examples/cel_pandas_dataframe_usage.py new file mode 100644 index 0000000..9482325 --- /dev/null +++ b/examples/cel_pandas_dataframe_usage.py @@ -0,0 +1,400 @@ +""" + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +""" +CEL Validation with Pandas DataFrames Example + +This example demonstrates how to use CEL (Common Expression Language) +expressions for custom validation logic with Pandas DataFrames. + +CEL provides flexible, safe expression evaluation for complex business rules +that go beyond the capabilities of predefined validation checks. 
+ +Key Features Demonstrated: +- CEL expressions with pandas DataFrames +- Simple syntax for column references (e.g., 'salary > min_salary') +- Complex multi-column business rules +- Memory-efficient chunked processing +- Validation result analysis and filtering +""" + +import pandas as pd +from wxdi.dq_validator import ( + AssetMetadata, ColumnMetadata, DataType, + Validator, ValidationRule, + CELCheck, CompletenessCheck +) +from wxdi.dq_validator.integrations import PandasValidator + + +def main(): + print("=" * 80) + print("CEL Validation with Pandas DataFrames Example") + print("=" * 80) + + # Step 1: Define asset metadata + print("\n[Step 1] Defining Asset Metadata") + print("-" * 80) + + metadata = AssetMetadata( + table_name='employees', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('name', DataType.STRING, length=100), + ColumnMetadata('email', DataType.STRING, length=255), + ColumnMetadata('age', DataType.INTEGER), + ColumnMetadata('department', DataType.STRING, length=50), + ColumnMetadata('salary', DataType.DECIMAL, precision=10, scale=2), + ColumnMetadata('min_salary', DataType.DECIMAL, precision=10, scale=2), + ColumnMetadata('bonus', DataType.DECIMAL, precision=10, scale=2), + ColumnMetadata('status', DataType.STRING, length=20), + ColumnMetadata('years_experience', DataType.INTEGER), + ] + ) + + print(f"Asset: {metadata.table_name}") + print(f"Columns: {len(metadata.columns)}") + + # Step 2: Create validator with CEL-based business rules + print("\n[Step 2] Configuring CEL Validation Rules") + print("-" * 80) + + validator = Validator(metadata) + + # Rule 1: Salary must be positive + print("\n[OK] Rule 1: Salary must be positive") + print(" CEL: 'value > 0'") + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='value > 0', + error_message='Salary must be positive' + )) + ) + + # Rule 2: Salary must exceed minimum salary + print("\n[OK] Rule 2: Salary must exceed minimum salary") + print(" 
CEL: 'value > min_salary'") + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='value > min_salary', + error_message='Salary must exceed minimum salary' + )) + ) + + # Rule 3: Age-based salary requirements + print("\n[OK] Rule 3: Age-based salary requirements") + print(" CEL: 'age > 40 ? value >= 80000 : value >= 50000'") + print(" (Senior employees must earn >=$80K, junior >=$50K)") + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='age > 40 ? value >= 80000 : value >= 50000', + error_message='Salary does not meet age-based requirements' + )) + ) + + # Rule 4: Email domain validation + print("\n[OK] Rule 4: Email must be from company domain") + print(" CEL: 'value.endsWith(\"@company.com\")'") + validator.add_rule( + ValidationRule('email') + .add_check(CompletenessCheck(missing_values_allowed=False)) + .add_check(CELCheck( + expression='value.endsWith("@company.com")', + error_message='Email must be from company domain (@company.com)' + )) + ) + + # Rule 5: Status validation + print("\n[OK] Rule 5: Status must be Active, Pending, or Approved") + print(" CEL: 'value in [\"Active\", \"Pending\", \"Approved\"]'") + validator.add_rule( + ValidationRule('status') + .add_check(CELCheck( + expression='value in ["Active", "Pending", "Approved"]', + error_message='Invalid status value' + )) + ) + + # Rule 6: Department-based bonus limits + print("\n[OK] Rule 6: Department-based bonus limits") + print(" CEL: 'department == \"Sales\" ? value <= 20000 : value <= 10000'") + print(" (Sales: <=$20K, Others: <=$10K)") + validator.add_rule( + ValidationRule('bonus') + .add_check(CELCheck( + expression='department == "Sales" ? 
value <= 20000 : value <= 10000', + error_message='Bonus exceeds department limit' + )) + ) + + # Rule 7: Experience-based salary validation + print("\n[OK] Rule 7: Salary must match experience level") + print(" CEL: 'value >= 40000 + (years_experience * 5000)'") + print(" (Base $40K + $5K per year of experience)") + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='value >= 40000 + (years_experience * 5000)', + error_message='Salary too low for experience level' + )) + ) + + # Rule 8: Sales age requirement + print("\n[OK] Rule 8: Sales employees must be at least 21") + print(" CEL: 'value >= 21 || department != \"Sales\"'") + validator.add_rule( + ValidationRule('age') + .add_check(CELCheck( + expression='value >= 21 || department != "Sales"', + error_message='Sales employees must be at least 21 years old' + )) + ) + + print(f"\n[OK] Validator configured with {len(validator.rules)} rules") + + # Step 3: Create sample DataFrame + print("\n[Step 3] Creating Sample DataFrame") + print("-" * 80) + + df = pd.DataFrame({ + 'emp_id': [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008], + 'name': [ + 'John Doe', 'Jane Smith', 'Bob Wilson', 'Alice Brown', + 'Charlie Davis', 'Eve Martinez', 'Frank Lee', 'Grace Kim' + ], + 'email': [ + 'john@company.com', 'jane@other.com', 'bob@company.com', 'alice@company.com', + 'charlie@company.com', 'eve@company.com', 'frank@company.com', 'grace@company.com' + ], + 'age': [30, 45, 20, 50, 35, 28, 42, 38], + 'department': [ + 'Engineering', 'Sales', 'Sales', 'Engineering', + 'Sales', 'HR', 'Engineering', 'Finance' + ], + 'salary': [75000.00, 85000.00, 55000.00, 70000.00, 90000.00, 62000.00, 95000.00, 78000.00], + 'min_salary': [60000.00, 70000.00, 50000.00, 60000.00, 70000.00, 55000.00, 75000.00, 65000.00], + 'bonus': [5000.00, 18000.00, 8000.00, 12000.00, 25000.00, 7000.00, 9000.00, 8500.00], + 'status': ['Active', 'Active', 'Pending', 'Inactive', 'Active', 'Approved', 'Active', 'Pending'], + 
'years_experience': [5, 15, 2, 20, 10, 4, 12, 8] + }) + + print(f"\nDataFrame created with {len(df)} rows and {len(df.columns)} columns") + print("\nSample data (first 3 rows):") + print(df.head(3).to_string(index=False)) + + # Step 4: Create Pandas validator + print("\n[Step 4] Creating Pandas Validator") + print("-" * 80) + + pandas_validator = PandasValidator(validator, chunk_size=1000) + print(f"[OK] {pandas_validator}") + + # Step 5: Get summary statistics + print("\n[Step 5] Validation Summary Statistics") + print("-" * 80) + + summary = pandas_validator.get_summary_statistics(df) + print(f"\nTotal Rows: {summary['total_rows']}") + print(f"Valid Rows: {summary['valid_rows']} ({summary['pass_rate']:.1f}%)") + print(f"Invalid Rows: {summary['invalid_rows']}") + print(f"Total Checks: {summary['total_checks']}") + print(f"Passed Checks: {summary['passed_checks']}") + print(f"Failed Checks: {summary['failed_checks']}") + + # Step 6: Add validation column + print("\n[Step 6] Adding Validation Results to DataFrame") + print("-" * 80) + + df_validated = pandas_validator.add_validation_column(df) + + print(f"\n[OK] Validation column added: '{pandas_validator.result_column_name}'") + print(f"[OK] Total columns: {len(df_validated.columns)}") + + # Display validation results + print("\nValidation Results by Row:") + print("-" * 80) + for idx, row in df_validated.iterrows(): + result = row['dq_validation_result'] + is_valid = bool(result['is_valid']) + status = "[PASS]" if is_valid else "[FAIL]" + print(f"Row {idx}: {status} | {row['name']:20s} | Score: {str(result['score']):>6s} | " + f"Pass Rate: {result['pass_rate']:6.1f}% | Errors: {result['error_count']}") + + # Step 7: Analyze invalid rows + print("\n[Step 7] Analyzing Invalid Rows") + print("-" * 80) + + invalid_df = pandas_validator.get_invalid_rows(df) + + if len(invalid_df) > 0: + print(f"\nFound {len(invalid_df)} invalid row(s):\n") + + for idx, row in invalid_df.iterrows(): + validation = 
row['dq_validation_result'] + print(f"Row {idx}: {row['name']} ({row['department']})") + print(f" Age: {row['age']}, Salary: ${row['salary']:,.2f}, Bonus: ${row['bonus']:,.2f}") + print(f" Email: {row['email']}, Status: {row['status']}") + print(f" Validation Score: {validation['score']} ({validation['pass_rate']:.1f}%)") + print(f" Failed Checks: {validation['failed_checks']}/{validation['total_checks']}") + + # Parse and display errors + import json + errors = validation['errors'] + error_count = len(errors) if isinstance(errors, list) else 0 + if error_count > 0: + print(f" Errors:") + for error_json in errors: + error = json.loads(error_json) + print(f" - {error['column']}: {error['message']}") + print() + else: + print("\n[OK] All rows passed validation!") + + # Step 8: Expand validation columns for analysis + print("\n[Step 8] Expanding Validation Columns") + print("-" * 80) + + df_expanded = pandas_validator.expand_validation_column(df_validated) + + print(f"\n[OK] Validation struct expanded into separate columns") + print(f"[OK] New columns: {[c for c in df_expanded.columns if c.startswith('dq_')]}") + + # Show expanded validation data + print("\nExpanded Validation Data:") + validation_cols = ['name', 'department', 'dq_is_valid', 'dq_score', + 'dq_pass_rate', 'dq_error_count'] + print(df_expanded[validation_cols].to_string(index=False)) + + # Step 9: Filter and analyze by department + print("\n[Step 9] Department-Level Analysis") + print("-" * 80) + + dept_analysis = df_expanded.groupby('department').agg({ + 'dq_is_valid': ['sum', 'count'], + 'dq_pass_rate': 'mean', + 'dq_error_count': 'sum' + }).round(2) + + dept_analysis.columns = ['Valid_Rows', 'Total_Rows', 'Avg_Pass_Rate', 'Total_Errors'] + dept_analysis['Pass_Rate_%'] = (dept_analysis['Valid_Rows'] / dept_analysis['Total_Rows'] * 100).round(1) + + print("\nValidation Statistics by Department:") + print(dept_analysis.to_string()) + + # Step 10: Get detailed statistics + print("\n[Step 10] Detailed 
Validation Statistics") + print("-" * 80) + + consolidator = pandas_validator.get_detailed_statistics(df) + + print("\nOverall Statistics:") + overall = consolidator.get_overall_statistics() + print(f" Total Records: {overall['total_records']}") + print(f" Valid Records: {overall['valid_records']} ({overall['pass_rate']:.1f}%)") + print(f" Invalid Records: {overall['invalid_records']}") + print(f" Total Errors: {overall['total_errors']}") + + print("\nStatistics by Column:") + for column in consolidator.get_columns(): + stats = consolidator.get_column_statistics(column) + if stats['total'] > 0: + pass_rate = (stats['passed'] / stats['total'] * 100) if stats['total'] > 0 else 0.0 + print(f" {column:20s}: {stats['passed']:2d}/{stats['total']:2d} passed " + f"({pass_rate:5.1f}%) - {stats['failed']} failed") + + print("\nStatistics by Check Type:") + for check in consolidator.get_checks(): + stats = consolidator.get_check_statistics(check) + if stats['total'] > 0: + pass_rate = (stats['passed'] / stats['total'] * 100) if stats['total'] > 0 else 0.0 + print(f" {check:30s}: {stats['passed']:2d}/{stats['total']:2d} passed " + f"({pass_rate:5.1f}%)") + + # Step 11: Save results + print("\n[Step 11] Saving Results") + print("-" * 80) + + # Save invalid rows + if len(invalid_df) > 0: + invalid_df.to_csv('cel_invalid_employees.csv', index=False) + print("[OK] Saved invalid rows to: cel_invalid_employees.csv") + + # Save expanded results + df_expanded.to_csv('cel_validation_results.csv', index=False) + print("[OK] Saved validation results to: cel_validation_results.csv") + + # Save department analysis + dept_analysis.to_csv('cel_department_analysis.csv') + print("[OK] Saved department analysis to: cel_department_analysis.csv") + + # Step 12: CEL Expression Tips + print("\n" + "=" * 80) + print("CEL Expression Tips for Pandas DataFrames") + print("=" * 80) + print(""" +1. Simple Syntax (Recommended): + - Direct column access: 'salary > min_salary' + - No 'record.' 
prefix needed: 'age > 40' + - Cleaner and more readable + +2. Available Variables: + - value: Current column value being validated + - Column names: Direct access to any column (e.g., age, salary, department) + - column_name: Name of the column being validated + - record_index: Position of the record in the batch + +3. Supported Operations: + - Comparisons: ==, !=, <, <=, >, >= + - Logical: &&, ||, ! + - Arithmetic: +, -, *, /, % + - Ternary: condition ? true_value : false_value + - String: .startsWith(), .endsWith(), .contains() + - List: in, not in + +4. Complex Business Rules: + - Multi-column: 'salary > min_salary && bonus < salary * 0.3' + - Conditional: 'age > 40 ? salary >= 80000 : salary >= 50000' + - Department-based: 'department == "Sales" ? value <= 20000 : value <= 10000' + +5. Performance Optimization: + - CEL automatically extracts only required columns from wide tables + - Chunked processing handles large DataFrames efficiently + - Memory usage: O(chunk_size) instead of O(n) + +6. Case Sensitivity: + WARNING: Column names are CASE-SENSITIVE + - 'salary' != 'Salary' != 'SALARY' + - Use exact column names from metadata +""") + + print("\n" + "=" * 80) + print("Example Complete!") + print("=" * 80) + + +if __name__ == '__main__': + try: + main() + except ImportError as e: + print(f"Error: {e}") + print("\nTo run this example, install required dependencies:") + print(" pip install pandas cel-python") + print("Or install with all integrations:") + print(" pip install wxdi[pandas]") + +# Made with Bob diff --git a/examples/cel_usage.py b/examples/cel_usage.py new file mode 100644 index 0000000..e02ff70 --- /dev/null +++ b/examples/cel_usage.py @@ -0,0 +1,321 @@ +""" + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +""" +CEL (Common Expression Language) Validation Example + +This example demonstrates how to use CEL expressions for custom validation +logic in the IBM watsonx.data Intelligence SDK. + +CEL provides flexible, safe expression evaluation for complex business rules +that go beyond the capabilities of predefined validation checks. + +SYNTAX OPTIONS: +- Simple Syntax (RECOMMENDED): 'value > min_salary', 'age > 40' + Column names can be referenced directly without 'record.' prefix + +- Explicit Syntax (still supported): 'value > record.min_salary', 'record.age > 40' + Use 'record.' prefix for explicit column access + +Both syntaxes work identically and can be mixed in the same validation rules. 
+""" + +from wxdi.dq_validator import ( + AssetMetadata, ColumnMetadata, DataType, + Validator, ValidationRule, + CELCheck, RangeCheck, CompletenessCheck +) + + +def main(): + print("=" * 70) + print("CEL (Common Expression Language) Validation Example") + print("=" * 70) + + # Step 1: Define asset metadata + metadata = AssetMetadata( + table_name='employees', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('name', DataType.STRING, length=100), + ColumnMetadata('email', DataType.STRING, length=255), + ColumnMetadata('age', DataType.INTEGER), + ColumnMetadata('department', DataType.STRING, length=50), + ColumnMetadata('salary', DataType.DECIMAL, precision=10, scale=2), + ColumnMetadata('min_salary', DataType.DECIMAL, precision=10, scale=2), + ColumnMetadata('bonus', DataType.DECIMAL, precision=10, scale=2), + ColumnMetadata('status', DataType.STRING, length=20), + ] + ) + + print(f"\nAsset: {metadata.table_name}") + print(f"Columns: {len(metadata.columns)}") + + # Step 2: Create validator with CEL checks + validator = Validator(metadata) + + # Example 1: Simple value validation + print("\n" + "=" * 70) + print("Example 1: Simple Value Validation") + print("=" * 70) + print("Rule: Salary must be positive") + print("CEL Expression: 'value > 0'") + + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='value > 0', + error_message='Salary must be positive' + )) + ) + + # Example 2: Multi-column comparison (SIMPLE SYNTAX) + print("\n" + "=" * 70) + print("Example 2: Multi-Column Comparison (Simple Syntax)") + print("=" * 70) + print("Rule: Salary must be greater than minimum salary") + print("Simple Syntax: 'value > min_salary'") + print("(Also works: 'value > record.min_salary')") + + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='value > min_salary', # Simple syntax + error_message='Salary must exceed minimum salary' + )) + ) + + # Example 3: Complex business logic with 
conditional (SIMPLE SYNTAX) + print("\n" + "=" * 70) + print("Example 3: Age-Based Salary Requirements (Simple Syntax)") + print("=" * 70) + print("Rule: Senior employees (age > 40) must earn at least $80,000") + print(" Junior employees (age <= 40) must earn at least $50,000") + print("Simple Syntax: 'age > 40 ? value >= 80000 : value >= 50000'") + print("(Also works: 'record.age > 40 ? value >= 80000 : value >= 50000')") + + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='age > 40 ? value >= 80000 : value >= 50000', # Simple syntax + error_message='Salary does not meet age-based requirements' + )) + ) + + # Example 4: String operations + print("\n" + "=" * 70) + print("Example 4: Email Domain Validation") + print("=" * 70) + print("Rule: Email must be from company domain") + print("CEL Expression: 'value.endsWith(\"@company.com\")'") + + validator.add_rule( + ValidationRule('email') + .add_check(CELCheck( + expression='value.endsWith("@company.com")', + error_message='Email must be from company domain (@company.com)' + )) + ) + + # Example 5: List membership + print("\n" + "=" * 70) + print("Example 5: Status Validation") + print("=" * 70) + print("Rule: Status must be one of: Active, Pending, Approved") + print("CEL Expression: 'value in [\"Active\", \"Pending\", \"Approved\"]'") + + validator.add_rule( + ValidationRule('status') + .add_check(CELCheck( + expression='value in ["Active", "Pending", "Approved"]', + error_message='Invalid status value' + )) + ) + + # Example 6: Department-based bonus limits (SIMPLE SYNTAX) + print("\n" + "=" * 70) + print("Example 6: Department-Based Bonus Limits (Simple Syntax)") + print("=" * 70) + print("Rule: Sales can have bonus up to $20K, others up to $10K") + print("Simple Syntax: 'department == \"Sales\" ? value <= 20000 : value <= 10000'") + print("(Also works: 'record.department == \"Sales\" ? 
value <= 20000 : value <= 10000')") + + validator.add_rule( + ValidationRule('bonus') + .add_check(CELCheck( + expression='department == "Sales" ? value <= 20000 : value <= 10000', # Simple syntax + error_message='Bonus exceeds department limit' + )) + ) + + # Example 7: Arithmetic with simple syntax + print("\n" + "=" * 70) + print("Example 7: Arithmetic Operations (Simple Syntax)") + print("=" * 70) + print("Rule: Salary must be at least 20% above minimum") + print("Simple Syntax: 'value >= min_salary * 1.2'") + + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression='value >= min_salary * 1.2', # Simple syntax + error_message='Salary must be at least 20% above minimum' + )) + ) + + # Example 8: Combining CEL with other checks (SIMPLE SYNTAX) + print("\n" + "=" * 70) + print("Example 8: Combining CEL with Other Checks") + print("=" * 70) + print("Combining: CompletenessCheck + RangeCheck + CELCheck") + print("Simple Syntax: 'value >= 21 || department != \"Sales\"'") + + validator.add_rule( + ValidationRule('age') + .add_check(CompletenessCheck(missing_values_allowed=False)) + .add_check(RangeCheck(min_value=18, max_value=65)) + .add_check(CELCheck( + expression='value >= 21 || department != "Sales"', # Simple syntax + error_message='Sales employees must be at least 21 years old' + )) + ) + + print(f"\nValidator configured with {len(validator.rules)} rules") + + # Step 3: Test with sample records + print("\n" + "=" * 70) + print("Validating Sample Records") + print("=" * 70) + + records = [ + # [emp_id, name, email, age, department, salary, min_salary, bonus, status] + [1001, 'John Doe', 'john@company.com', 30, 'Engineering', 75000.00, 60000.00, 5000.00, 'Active'], + [1002, 'Jane Smith', 'jane@other.com', 45, 'Sales', 85000.00, 70000.00, 18000.00, 'Active'], + [1003, 'Bob Wilson', 'bob@company.com', 20, 'Sales', 55000.00, 50000.00, 8000.00, 'Pending'], + [1004, 'Alice Brown', 'alice@company.com', 50, 'Engineering', 70000.00, 60000.00, 
12000.00, 'Inactive'], + [1005, 'Charlie Davis', 'charlie@company.com', 35, 'Sales', 90000.00, 70000.00, 25000.00, 'Active'], + ] + + results = validator.validate_batch(records) + + # Step 4: Display results + for idx, result in enumerate(results): + record = records[idx] + status_symbol = '[PASS]' if result.is_valid else '[FAIL]' + + print(f"\nRecord {idx + 1}: {status_symbol}") + print(f" Employee: {record[1]} ({record[4]})") + print(f" Age: {record[3]}, Salary: ${record[5]:,.2f}, Bonus: ${record[7]:,.2f}") + print(f" Score: {result.score}, Pass Rate: {result.pass_rate:.1f}%") + + if not result.is_valid: + print(f" Errors ({len(result.errors)}):") + for error in result.errors: + print(f" - {error.column_name}: {error.message}") + + # Step 5: Summary statistics + print("\n" + "=" * 70) + print("Summary") + print("=" * 70) + + total_records = len(results) + valid_records = sum(1 for r in results if r.is_valid) + invalid_records = total_records - valid_records + overall_pass_rate = (valid_records / total_records) * 100 + + print(f"Total Records: {total_records}") + print(f"Valid Records: {valid_records}") + print(f"Invalid Records: {invalid_records}") + print(f"Overall Pass Rate: {overall_pass_rate:.1f}%") + + # Step 6: CEL Expression Tips + print("\n" + "=" * 70) + print("CEL Expression Tips") + print("=" * 70) + print(""" +1. Available Variables: + - value: Current column value + - record: Dictionary of all column values (e.g., record.age, record.salary) + - column_name: Name of the column being validated + - record_index: Position of the record in the batch + +2. Supported Operators: + - Comparison: ==, !=, <, <=, >, >= + - Logical: &&, ||, ! + - Arithmetic: +, -, *, /, % + - String: contains, startsWith, endsWith, matches + - List: in, size, all, exists + - Ternary: condition ? true_value : false_value + +3. 
Best Practices: + - Keep expressions simple and readable + - Use descriptive error messages + - Test expressions with sample data + - Combine with other checks for comprehensive validation + - Use ternary operator for conditional logic + +4. Performance: + - Expressions are compiled once at initialization + - Evaluation is fast (~10-100 microseconds per record) + - Suitable for high-throughput validation + """) + + print("\n" + "=" * 70) + + # Example 9: Variable Bindings for Reusable Templates + print("\n" + "=" * 70) + print("Example 9: Variable Bindings (Reusable Templates)") + print("=" * 70) + print("Create reusable validation templates with generic variable names") + print("that map to actual column names via bindings.") + + # Create a reusable template + range_template = 'current > minimum' + + # Apply to salary + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck( + expression=range_template, + bindings={'current': 'salary', 'minimum': 'min_salary'}, + error_message='Salary below minimum' + )) + ) + + # Apply same template to bonus with different bindings + # (Note: This would need a min_bonus column in real usage) + print("\nSame template, different columns:") + print(" Salary check: bindings={'current': 'salary', 'minimum': 'min_salary'}") + print(" Bonus check: bindings={'current': 'bonus', 'minimum': 'min_bonus'}") + print("\nBenefits:") + print(" - Write validation logic once, reuse many times") + print(" - Update template in one place, affects all uses") + print(" - Generic names make intent clearer") + print(" - Backward compatible (bindings are optional)") + print("Example Complete!") + print("=" * 70) + + +if __name__ == '__main__': + try: + main() + except ImportError as e: + print(f"Error: {e}") + print("\nTo run this example, install cel-python:") + print(" pip install cel-python>=0.5.0") + print("Or install the full SDK:") + print(" pip install data-intelligence-sdk") + +# Made with Bob diff --git a/examples/table_cel_usage.py 
b/examples/table_cel_usage.py new file mode 100644 index 0000000..5124034 --- /dev/null +++ b/examples/table_cel_usage.py @@ -0,0 +1,318 @@ +""" + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +""" +Table-Level CEL (Common Expression Language) Validation Example + +This example demonstrates how to use table-level CEL expressions for +cross-column validation and complex business rules that span multiple fields. + +KEY DIFFERENCES FROM COLUMN-LEVEL CEL: +- Column-level: Validates individual column values (e.g., 'value > 0') +- Table-level: Validates entire records (e.g., 'salary > min_salary && age >= 18') + +WHEN TO USE TABLE-LEVEL CEL: +- Cross-column validation (start_date < end_date) +- Complex business rules spanning multiple fields +- Conditional logic based on multiple columns +- Record-level consistency checks +""" + +from wxdi.dq_validator import ( + AssetMetadata, ColumnMetadata, DataType, + Validator, ValidationRule, TableValidationRule, + CELCheck, TableCELCheck, CompletenessCheck +) + + +def main(): + print("=" * 70) + print("Table-Level CEL Validation Example") + print("=" * 70) + + # Step 1: Define asset metadata + metadata = AssetMetadata( + table_name='employees', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('name', DataType.STRING, length=100), + ColumnMetadata('email', DataType.STRING, length=255), + ColumnMetadata('age', DataType.INTEGER), + ColumnMetadata('department', DataType.STRING, 
length=50), + ColumnMetadata('salary', DataType.DECIMAL, precision=10, scale=2), + ColumnMetadata('min_salary', DataType.DECIMAL, precision=10, scale=2), + ColumnMetadata('bonus', DataType.DECIMAL, precision=10, scale=2), + ColumnMetadata('start_date', DataType.STRING, length=10), + ColumnMetadata('end_date', DataType.STRING, length=10), + ] + ) + + print(f"\nAsset: {metadata.table_name}") + print(f"Columns: {len(metadata.columns)}") + + # Step 2: Create validator with table-level CEL checks + validator = Validator(metadata) + + # Example 1: Multi-column comparison + print("\n" + "=" * 70) + print("Example 1: Multi-Column Salary Validation") + print("=" * 70) + print("Rule: Salary must exceed minimum salary") + print("Table CEL: 'salary > min_salary'") + + validator.add_table_rule( + TableValidationRule('salary_check') + .add_check(TableCELCheck( + 'salary > min_salary', + error_message='Salary must exceed minimum salary' + )) + ) + + # Example 2: Complex age-based business rules + print("\n" + "=" * 70) + print("Example 2: Age-Based Salary Requirements") + print("=" * 70) + print("Rule: Senior employees (age > 40) must earn at least $80,000") + print(" Junior employees (age <= 40) must earn at least $50,000") + print("Table CEL: 'age > 40 ? salary >= 80000 : salary >= 50000'") + + validator.add_table_rule( + TableValidationRule('age_salary_check') + .add_check(TableCELCheck( + 'age > 40 ? salary >= 80000 : salary >= 50000', + error_message='Salary does not meet age-based requirements' + )) + ) + + # Example 3: Department-specific rules + print("\n" + "=" * 70) + print("Example 3: Department-Specific Validation") + print("=" * 70) + print("Rule: Sales employees must be at least 21 years old") + print("Table CEL: 'department == \"Sales\" ? age >= 21 : true'") + + validator.add_table_rule( + TableValidationRule('sales_age_check') + .add_check(TableCELCheck( + 'department == "Sales" ? 
age >= 21 : true', + error_message='Sales employees must be at least 21 years old' + )) + ) + + # Example 4: Bonus limits by department + print("\n" + "=" * 70) + print("Example 4: Department-Based Bonus Limits") + print("=" * 70) + print("Rule: Sales can have bonus up to $20K, others up to $10K") + print("Table CEL: 'department == \"Sales\" ? bonus <= 20000 : bonus <= 10000'") + + validator.add_table_rule( + TableValidationRule('bonus_limit_check') + .add_check(TableCELCheck( + 'department == "Sales" ? bonus <= 20000 : bonus <= 10000', + error_message='Bonus exceeds department limit' + )) + ) + + # Example 5: Date consistency + print("\n" + "=" * 70) + print("Example 5: Date Consistency Check") + print("=" * 70) + print("Rule: Start date must be before end date") + print("Table CEL: 'start_date < end_date'") + + validator.add_table_rule( + TableValidationRule('date_consistency') + .add_check(TableCELCheck( + 'start_date < end_date', + error_message='Start date must be before end date' + )) + ) + + # Example 6: Complex multi-field validation + print("\n" + "=" * 70) + print("Example 6: Complex Multi-Field Validation") + print("=" * 70) + print("Rule: Total compensation (salary + bonus) must be reasonable") + print("Table CEL: 'salary + bonus <= min_salary * 2.5'") + + validator.add_table_rule( + TableValidationRule('total_comp_check') + .add_check(TableCELCheck( + 'salary + bonus <= min_salary * 2.5', + error_message='Total compensation exceeds 2.5x minimum salary' + )) + ) + + # Example 7: Combining column-level and table-level rules + print("\n" + "=" * 70) + print("Example 7: Combining Column and Table Rules") + print("=" * 70) + print("Column Rule: Email must not be null") + print("Table Rule: Email domain must match department") + + # Column-level: Basic completeness check + validator.add_rule( + ValidationRule('email') + .add_check(CompletenessCheck(missing_values_allowed=False)) + ) + + # Table-level: Cross-field validation + validator.add_table_rule( + 
TableValidationRule('email_domain_check') + .add_check(TableCELCheck( + 'department == "Sales" ? email.endsWith("@sales.company.com") : email.endsWith("@company.com")', + error_message='Email domain does not match department' + )) + ) + + print(f"\nValidator configured with:") + print(f" - {len(validator.rules)} column-level rules") + print(f" - {len(validator.table_rules)} table-level rules") + + # Step 3: Test with sample records + print("\n" + "=" * 70) + print("Validating Sample Records") + print("=" * 70) + + records = [ + # [emp_id, name, email, age, department, salary, min_salary, bonus, start_date, end_date] + [1001, 'John Doe', 'john@company.com', 30, 'Engineering', 75000.00, 60000.00, 5000.00, '2020-01-01', '2025-12-31'], + [1002, 'Jane Smith', 'jane@sales.company.com', 45, 'Sales', 85000.00, 70000.00, 18000.00, '2019-06-15', '2024-06-15'], + [1003, 'Bob Wilson', 'bob@sales.company.com', 20, 'Sales', 55000.00, 50000.00, 8000.00, '2021-03-01', '2026-03-01'], + [1004, 'Alice Brown', 'alice@company.com', 50, 'Engineering', 70000.00, 60000.00, 12000.00, '2018-09-01', '2023-09-01'], + [1005, 'Charlie Davis', 'charlie@sales.company.com', 35, 'Sales', 90000.00, 70000.00, 25000.00, '2020-11-01', '2025-11-01'], + [1006, 'Eve Martinez', 'eve@company.com', 28, 'HR', 62000.00, 55000.00, 7000.00, '2022-01-15', '2027-01-15'], + [1007, 'Frank Lee', 'frank@company.com', 42, 'Engineering', 95000.00, 75000.00, 9000.00, '2017-04-01', '2022-04-01'], + [1008, 'Grace Kim', 'grace@company.com', 38, 'Finance', 78000.00, 65000.00, 8500.00, '2025-01-01', '2024-01-01'], # Invalid: end_date < start_date + ] + + results = validator.validate_batch(records) + + # Step 4: Display results + for idx, result in enumerate(results): + record = records[idx] + status_symbol = '[PASS]' if result.is_valid else '[FAIL]' + + print(f"\nRecord {idx + 1}: {status_symbol}") + print(f" Employee: {record[1]} ({record[4]})") + print(f" Age: {record[3]}, Salary: ${record[5]:,.2f}, Bonus: 
${record[7]:,.2f}") + print(f" Dates: {record[8]} to {record[9]}") + print(f" Score: {result.score}, Pass Rate: {result.pass_rate:.1f}%") + + if not result.is_valid: + print(f" Errors ({len(result.errors)}):") + for error in result.errors: + print(f" - {error.column_name}: {error.message}") + + # Step 5: Summary statistics + print("\n" + "=" * 70) + print("Summary") + print("=" * 70) + + total_records = len(results) + valid_records = sum(1 for r in results if r.is_valid) + invalid_records = total_records - valid_records + overall_pass_rate = (valid_records / total_records) * 100 + + print(f"Total Records: {total_records}") + print(f"Valid Records: {valid_records}") + print(f"Invalid Records: {invalid_records}") + print(f"Overall Pass Rate: {overall_pass_rate:.1f}%") + + # Step 6: Key Takeaways + print("\n" + "=" * 70) + print("Key Takeaways: Table-Level vs Column-Level CEL") + print("=" * 70) + print(""" +COLUMN-LEVEL CEL (CELCheck): +- Validates individual column values +- Has access to 'value' variable (current column) +- Example: CELCheck('value > 0') +- Use for: Single-column validation + +TABLE-LEVEL CEL (TableCELCheck): +- Validates entire records +- NO 'value' variable (no single column focus) +- Direct access to all columns +- Example: TableCELCheck('salary > min_salary && age >= 18') +- Use for: Cross-column validation, complex business rules + +WHEN TO USE EACH: ++------------------------------------------------------------------+ +| Column-Level CEL | Table-Level CEL | ++----------------------------------+--------------------------------+ +| - Single column validation | - Cross-column validation | +| - Value range checks | - Multi-field business rules | +| - Format validation | - Conditional logic | +| - Simple comparisons | - Date consistency | +| | - Complex calculations | ++----------------------------------+--------------------------------+ + +BEST PRACTICES: +1. Use column-level for simple, single-field checks +2. 
Use table-level for cross-field validation +3. Combine both for comprehensive validation +4. Keep expressions readable and maintainable +5. Use descriptive rule names for error tracking +""") + + # Example 8: Variable Bindings for Reusable Table Rules + print("\n" + "=" * 70) + print("Example 8: Variable Bindings (Reusable Table Rules)") + print("=" * 70) + print("Create reusable table-level validation templates with generic") + print("variable names that map to actual column names via bindings.") + + # Create a reusable template for age-based validation + age_based_template = 'person_age >= min_age && compensation > minimum' + + # Apply with bindings + validator.add_table_rule( + TableValidationRule('eligibility_check') + .add_check(TableCELCheck( + expression=age_based_template, + bindings={ + 'person_age': 'age', + 'min_age': 'age', # Could map to different column in other contexts + 'compensation': 'salary', + 'minimum': 'min_salary' + }, + error_message='Employee does not meet eligibility requirements' + )) + ) + + print("\nTemplate: 'person_age >= min_age && compensation > minimum'") + print("Bindings: {'person_age': 'age', 'compensation': 'salary', ...}") + print("\nBenefits:") + print(" - Reusable across different data contexts") + print(" - Generic names clarify business intent") + print(" - Same template for different column combinations") + print(" - Backward compatible (bindings are optional)") + + print("\n" + "=" * 70) + print("Example Complete!") + print("=" * 70) + + +if __name__ == '__main__': + try: + main() + except ImportError as e: + print(f"Error: {e}") + print("\nTo run this example, install required dependencies:") + print(" pip install cel-python>=0.5.0") + +# Made with Bob diff --git a/requirements-dev.txt b/requirements-dev.txt index 0da4263..29478f3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,3 +15,7 @@ ibm-cloud-sdk-core==3.24.4 black>=26.3.1 pylint>=3.0.0 +pytest>=9.0.3 +pytest-cov>=4.0.0 +pytest-mock>=3.7.0 
+responses>=0.20.0 diff --git a/requirements.txt b/requirements.txt index 0951bc4..d9a35ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,8 +13,11 @@ # limitations under the License. # Core dependencies (defined in setup.py, included here for development convenience) +# Note: pydantic, requests, regex, urllib3, python-dateutil, PyJWT, pyyaml, numpy are defined in setup.py +# Note: cel-python is defined in setup.py pydantic>=2.12.0 -requests>=2.32.4 +# Uncomment to include +# requests>=2.33.1 regex>=2023.0.0 urllib3>=2.6.3 python-dateutil>=2.5.3,<3.0.0 @@ -24,10 +27,7 @@ numpy>=1.24.0 # Note: ibm-cloud-sdk-core is defined in setup.py with exact version pin # Development dependencies -pytest>=7.0.0 -pytest-cov>=4.0.0 -pytest-mock>=3.7.0 -responses>=0.20.0 +# pytest and related dependencies are in requirements-dev.txt # black is defined in setup.py extras_require['dev'] to avoid BOM conflicts mypy>=1.0.0 flake8>=6.0.0 diff --git a/setup.py b/setup.py index 5c928a1..a7c096a 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ setup( name="data-intelligence-sdk", - version='2.0.0', + version='2.1.0-rc.1', author="IBM", author_email="Data_Intelligence_SDK@wwpdl.vnet.ibm.com", description="A Python SDK for IBM watsonx.data intelligence that provides data quality validation for streaming records and DataFrames, " \ @@ -59,6 +59,7 @@ "python-dateutil>=2.5.3,<3.0.0", "pyyaml>=5.4.0,<7.0.0", "numpy>=1.24.0", + "cel-python>=0.5.0", # Pinned to exact version to avoid CRA bom-generate pip resolver conflict. # CRA sees ibm-cloud-sdk-core from both setup.py and requirements.txt and # fails with ResolutionImpossible when constraints differ (bare vs >=). 
@@ -66,7 +67,7 @@ ], extras_require={ "dev": [ - "pytest>=7.0.0", + "pytest>=9.0.3", "pytest-cov>=4.0.0", "pytest-mock>=3.7.0", "responses>=0.20.0", diff --git a/src/wxdi/data_product_recommender/README.md b/src/wxdi/data_product_recommender/README.md index 5c71fbc..2e8f173 100644 --- a/src/wxdi/data_product_recommender/README.md +++ b/src/wxdi/data_product_recommender/README.md @@ -70,8 +70,8 @@ python -m wxdi.data_product_recommender.cli \ ### Python API ```python -from data_product_recommender.platforms import SnowflakeQueryParser -from data_product_recommender.recommender import DataProductRecommender +from wxdi.data_product_recommender.platforms import SnowflakeQueryParser +from wxdi.data_product_recommender.recommender import DataProductRecommender # Initialize with platform-specific parser parser = SnowflakeQueryParser() diff --git a/src/wxdi/data_product_recommender/cli.py b/src/wxdi/data_product_recommender/cli.py index d884a90..4fb136b 100644 --- a/src/wxdi/data_product_recommender/cli.py +++ b/src/wxdi/data_product_recommender/cli.py @@ -33,6 +33,7 @@ def main(): + """Main entry point for the data product recommender CLI.""" parser = argparse.ArgumentParser( description='Analyze query logs to recommend data products' ) diff --git a/src/wxdi/data_product_recommender/recommender.py b/src/wxdi/data_product_recommender/recommender.py index 65f9732..f9a7e95 100644 --- a/src/wxdi/data_product_recommender/recommender.py +++ b/src/wxdi/data_product_recommender/recommender.py @@ -944,14 +944,13 @@ def _get_star_rating(self, score: float) -> tuple: """Get star rating and label for a score""" if score >= 80: return FIVE_STARS, EXCELLENT_CANDIDATE - elif score >= 60: + if score >= 60: return FOUR_STARS, GOOD_CANDIDATE - elif score >= 40: + if score >= 40: return THREE_STARS, FAIR_CANDIDATE - elif score >= 20: + if score >= 20: return TWO_STARS, WEAK_CANDIDATE - else: - return ONE_STAR, POOR_CANDIDATE + return ONE_STAR, POOR_CANDIDATE def 
_merge_and_sort_products(self, recommendations: dict) -> list: """Merge groups and standalone tables, sorted by score""" @@ -1126,7 +1125,6 @@ def export_recommendations_markdown(self, recommendations: dict, output_file: st def _build_json_metadata(self, recommendations: dict) -> dict: """Build metadata section for JSON export""" - from datetime import datetime return { "generated_at": datetime.now().isoformat(), "total_queries_analyzed": len(self.query_logs) if self.query_logs is not None else 0, @@ -1259,8 +1257,6 @@ def _process_non_clustered_recommendations(self, recommendations: dict) -> list: def export_recommendations_json(self, recommendations: dict, output_file: str): """Export recommendations to JSON file for agent consumption""" - import json - output = { "recommendations": [], "metadata": self._build_json_metadata(recommendations) @@ -1280,13 +1276,12 @@ def _get_rating_label(self, score: float) -> str: """Convert numeric score to rating label""" if score >= 80: return "excellent" - elif score >= 60: + if score >= 60: return "good" - elif score >= 40: + if score >= 40: return "fair" - elif score >= 20: + if score >= 20: return "weak" - else: - return "poor" + return "poor" # Made with Bob diff --git a/src/wxdi/dph_services/README.md b/src/wxdi/dph_services/README.md index 3610050..ca23656 100644 --- a/src/wxdi/dph_services/README.md +++ b/src/wxdi/dph_services/README.md @@ -436,7 +436,7 @@ pytest tests/src/integration/test_dph_v1.py -v ## Requirements -- Python 3.8+ +- Python 3.10+ - ibm-cloud-sdk-core >= 3.16.7 - requests >= 2.32.4 - python-dateutil >= 2.5.3 diff --git a/src/wxdi/dq_validator/__init__.py b/src/wxdi/dq_validator/__init__.py index 41f3ccc..a16e5d3 100644 --- a/src/wxdi/dq_validator/__init__.py +++ b/src/wxdi/dq_validator/__init__.py @@ -25,6 +25,7 @@ from .result import ValidationResult from .result_consolidator import ValidationResultConsolidated from .rule import ValidationRule +from .table_rule import TableValidationRule from 
.validator import Validator from .checks.length_check import LengthCheck from .checks.valid_values_check import ValidValuesCheck @@ -35,8 +36,11 @@ from .checks.regex_check import RegexCheck from .checks.datatype_check import DataTypeCheck from .checks.format_check import FormatCheck, FormatConstraintType +from .checks.cel_check import CELCheck +from .checks.table_cel_check import TableCELCheck from .datetime_formats import DateTimeFormats from .data_quality_dimension import DataQualityDimension +from .cel_exceptions import CELError, CELCompilationError, CELEvaluationError # Re-export auth module for backward compatibility from wxdi.common.auth import AuthConfig, EnvironmentType, GovCloudAuthenticator, AuthProvider @@ -54,6 +58,7 @@ "ValidationResult", "ValidationResultConsolidated", "ValidationRule", + "TableValidationRule", "Validator", # Checks "LengthCheck", @@ -68,6 +73,12 @@ "DataTypeCheck", "FormatCheck", "FormatConstraintType", + "CELCheck", + "TableCELCheck", + # CEL Exceptions + "CELError", + "CELCompilationError", + "CELEvaluationError", # Authentication "AuthConfig", "EnvironmentType", diff --git a/src/wxdi/dq_validator/cel_context.py b/src/wxdi/dq_validator/cel_context.py new file mode 100644 index 0000000..0b9d374 --- /dev/null +++ b/src/wxdi/dq_validator/cel_context.py @@ -0,0 +1,613 @@ +# Copyright 2026 IBM Corporation +# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# See the LICENSE file in the project root for license information. + +""" +CEL Context Builder - Converts validation data into CEL-compatible context. + +OVERVIEW: +This module transforms raw validation data (arrays of values) into structured +dictionaries that CEL expressions can evaluate. It enables both simple and +explicit syntax for accessing column values in CEL expressions. + +KEY CONCEPTS: +1. Context Dictionary: A dict containing all variables available to CEL expressions +2. Record Array: List of values in column order, e.g., [1001, 'John', 75000] +3. Metadata: Column definitions that map array positions to column names +4. Dual Syntax Support: Allows both 'min_salary' and 'record.min_salary' + +EXAMPLE TRANSFORMATION: + Input: + - value: 75000 (current column being validated) + - column_name: 'salary' + - record: [1001, 'John', 75000, 60000] + - metadata: columns=['emp_id', 'name', 'salary', 'min_salary'] + + Output Context: + { + 'value': 75000, # Current value being validated + 'column_name': 'salary', # Name of current column + 'record_index': 0, # Position in batch + 'record': { # All columns as dict + 'emp_id': 1001, + 'name': 'John', + 'salary': 75000, + 'min_salary': 60000 + }, + # SIMPLE SYNTAX: Direct column access (added for convenience) + 'emp_id': 1001, + 'name': 'John', + 'salary': 75000, + 'min_salary': 60000 + } + + This allows CEL expressions to use either: + - Simple: 'value > min_salary' + - Explicit: 'value > record.min_salary' +""" + +from typing import Any, Dict, List, Optional +from .metadata import AssetMetadata + +try: + import celpy +except ImportError: + celpy = None + + +class CELContextBuilder: + """ + Builds CEL evaluation context from validation data. + + PURPOSE: + Transforms raw validation data into a structured dictionary that CEL + expressions can evaluate. Supports both simple ('min_salary') and + explicit ('record.min_salary') syntax for accessing column values. 
+ + CONTEXT VARIABLES PROVIDED: + ┌─────────────────┬──────────────────────────────────────────────────────┐ + │ Variable │ Description │ + ├─────────────────┼──────────────────────────────────────────────────────┤ + │ value │ Current column value being validated │ + │ column_name │ Name of the column being validated │ + │ record_index │ Position of record in batch (0-based) │ + │ record │ Dict of all columns: {'col1': val1, 'col2': val2} │ + │ <column> │ Direct access to each column (e.g., min_salary) │ + └─────────────────┴──────────────────────────────────────────────────────┘ + + RESERVED NAMES (cannot be used as column names with simple syntax): + - value, column_name, record_index, record + If your data has columns with these names, use explicit syntax: + 'record.value' instead of 'value' + + USAGE EXAMPLE: + >>> from wxdi.dq_validator import AssetMetadata, ColumnMetadata, DataType + >>> from wxdi.dq_validator.cel_context import CELContextBuilder + >>> + >>> # Define metadata + >>> metadata = AssetMetadata( + ... table_name='employees', + ... columns=[ + ... ColumnMetadata('emp_id', DataType.INTEGER), + ... ColumnMetadata('salary', DataType.DECIMAL), + ... ColumnMetadata('min_salary', DataType.DECIMAL) + ... ] + ... ) + >>> + >>> # Build context from record data + >>> record = [1001, 75000.00, 60000.00] + >>> context = CELContextBuilder.build_context( + ... value=75000.00, + ... column_name='salary', + ... record=record, + ... metadata=metadata, + ... record_index=0 + ...
) + >>> + >>> # Context now contains: + >>> # - value: 75000.00 + >>> # - column_name: 'salary' + >>> # - record_index: 0 + >>> # - record: {'emp_id': 1001, 'salary': 75000.00, 'min_salary': 60000.00} + >>> # - emp_id: 1001 (direct access) + >>> # - salary: 75000.00 (direct access) + >>> # - min_salary: 60000.00 (direct access) + """ + + @staticmethod + def _init_base_context(value: Any, column_name: str, record_index: int) -> Dict[str, Any]: + """Initialize base context with required variables.""" + return { + 'value': value, + 'column_name': column_name, + 'record_index': record_index + } + + @staticmethod + def _add_record_with_metadata( + context: Dict[str, Any], + record: List[Any], + metadata: AssetMetadata, + required_columns: Optional[set], + bindings: Optional[Dict[str, str]] + ) -> None: + """Add record data with metadata to context.""" + RESERVED_NAMES = {'value', 'column_name', 'record_index', 'record'} + + # Convert array to named dictionary + record_dict = CELContextBuilder._build_record_dict(record, metadata) + context['record'] = record_dict + + # Add columns directly to context for simple syntax + CELContextBuilder._add_columns_to_context( + context, record_dict, RESERVED_NAMES, required_columns + ) + + # Apply variable bindings if provided + CELContextBuilder._apply_bindings(context, record_dict, bindings) + + @staticmethod + def _add_record_without_metadata(context: Dict[str, Any], record: List[Any]) -> None: + """Add record data without metadata (positional columns).""" + positional_dict = {f'col_{i}': val for i, val in enumerate(record)} + context['record'] = positional_dict + context.update(positional_dict) + + @staticmethod + def _should_add_column(key: str, reserved_names: set, required_columns: Optional[set]) -> bool: + """Check if column should be added to context.""" + if key in reserved_names: + return False + return required_columns is None or key in required_columns + + @staticmethod + def _add_dict_columns( + context: Dict[str, Any], 
+ record_dict: dict, + reserved_names: set, + required_columns: Optional[set] + ) -> None: + """Add columns from a dict to context.""" + for key, val in record_dict.items(): + if CELContextBuilder._should_add_column(key, reserved_names, required_columns): + context[key] = val + + @staticmethod + def _add_maptype_columns( + context: Dict[str, Any], + record_dict: Any, + reserved_names: set, + required_columns: Optional[set] + ) -> None: + """Add columns from CEL MapType to context.""" + try: + for key in record_dict: + if CELContextBuilder._should_add_column(key, reserved_names, required_columns): + context[key] = record_dict[key] + except (TypeError, AttributeError): + # If iteration fails (not iterable or no __iter__), + # simple syntax won't work but explicit 'record.column' will + pass + + @staticmethod + def _add_columns_to_context( + context: Dict[str, Any], + record_dict: Any, + reserved_names: set, + required_columns: Optional[set] + ) -> None: + """Add columns from record_dict to context, respecting filters.""" + if isinstance(record_dict, dict): + CELContextBuilder._add_dict_columns(context, record_dict, reserved_names, required_columns) + else: + # CEL MapType object - may not support iteration + CELContextBuilder._add_maptype_columns(context, record_dict, reserved_names, required_columns) + + @staticmethod + def _apply_bindings( + context: Dict[str, Any], + record_dict: Any, + bindings: Optional[Dict[str, str]] + ) -> None: + """Apply variable bindings to context.""" + if bindings and isinstance(record_dict, dict): + for var_name, col_name in bindings.items(): + if col_name in record_dict: + context[var_name] = record_dict[col_name] + @staticmethod + def _add_table_record_with_metadata( + context: Dict[str, Any], + record: List[Any], + metadata: AssetMetadata, + required_columns: Optional[set], + bindings: Optional[Dict[str, str]] + ) -> None: + """Add record data with metadata to table context.""" + RESERVED_NAMES = {'record', 'record_index'} + + # 
Convert array to named dictionary + record_dict = CELContextBuilder._build_record_dict(record, metadata) + context['record'] = record_dict + + # Add columns directly to context for simple syntax + CELContextBuilder._add_columns_to_context( + context, record_dict, RESERVED_NAMES, required_columns + ) + + # Apply variable bindings if provided + CELContextBuilder._apply_bindings(context, record_dict, bindings) + + + @staticmethod + def build_context( + value: Any, + column_name: str, + record: Optional[List[Any]], + metadata: Optional[AssetMetadata], + record_index: int = 0, + required_columns: Optional[set] = None, + bindings: Optional[Dict[str, str]] = None + ) -> Dict[str, Any]: + """ + Build CEL evaluation context from validation data. + + WHAT THIS DOES: + Converts raw validation data into a structured dictionary that CEL + expressions can evaluate. The context includes both required variables + (value, column_name, etc.) and optional column access for convenience. + + OPTIMIZATION FOR WIDE TABLES: + When required_columns is provided, only those specific columns are added + directly to the context (in addition to the full record dict). This is + critical for assets with many columns (e.g., 100+) to: + - Reduce memory usage (avoid copying all column values) + - Improve performance (less data to process) + - Maintain correctness (record dict still has all columns) + + VARIABLE BINDINGS: + When bindings are provided, generic variable names in the expression are + mapped to actual column names. This allows reusable validation templates. + + PARAMETERS: + value: The specific column value being validated (e.g., 75000) + + column_name: Name of the column being validated (e.g., 'salary') + + record: Complete record as array (e.g., [1001, 'John', 75000, 60000]) + + metadata: Column definitions for mapping array positions to names + + record_index: Position of this record in the batch (default: 0) + + required_columns: Set of column names to include in context (optional). 
+ - If None: ALL columns are added directly to context + - If set: ONLY these columns are added directly to context + - The record dict always contains ALL columns regardless + + Example: {'min_salary', 'department'} + This adds only min_salary and department as top-level + variables, but record dict still has all columns. + + bindings: Variable name to column name mapping (optional). + Maps generic variable names to actual column names. + Example: {'current_value': 'salary', 'minimum': 'min_salary'} + Expression 'current_value > minimum' becomes 'salary > min_salary' + + RETURNS: + Dictionary with CEL variables. Structure depends on required_columns: + + Without required_columns (all columns added): + { + 'value': 75000, + 'column_name': 'salary', + 'record_index': 0, + 'record': {'emp_id': 1001, 'name': 'John', 'salary': 75000, 'min_salary': 60000}, + 'emp_id': 1001, # All columns added + 'name': 'John', # All columns added + 'salary': 75000, # All columns added + 'min_salary': 60000 # All columns added + } + + With required_columns={'min_salary'}: + { + 'value': 75000, + 'column_name': 'salary', + 'record_index': 0, + 'record': {'emp_id': 1001, 'name': 'John', 'salary': 75000, 'min_salary': 60000}, + 'min_salary': 60000 # Only required column added + } + Note: record dict still has all columns, but only min_salary is + added as a top-level variable for simple syntax access. + + USAGE IN CEL EXPRESSIONS: + After building context, you can use either syntax: + - Simple: 'value > min_salary' (if min_salary in required_columns or None) + - Explicit: 'value > record.min_salary' (always works) + Both work identically! + + EXAMPLES: + Basic usage (all columns): + >>> context = CELContextBuilder.build_context( + ... value=75000, + ... column_name='salary', + ... record=[1001, 75000, 60000], + ... metadata=metadata, + ... record_index=5 + ... ) + >>> # All columns available: min_salary, emp_id, etc. 
@staticmethod
def build_table_context(
    record: List[Any],
    metadata: AssetMetadata,
    record_index: int = 0,
    required_columns: Optional[set] = None,
    bindings: Optional[Dict[str, str]] = None
) -> Dict[str, Any]:
    """
    Build the CEL evaluation context for a table-level (whole record) check.

    Unlike build_context(), the resulting context carries no 'value' or
    'column_name' entry: the rule targets the record as a whole rather than
    one column.

    The context always starts with 'record_index'. When both ``metadata``
    and ``record`` are truthy, the remaining entries are filled in by
    ``CELContextBuilder._add_table_record_with_metadata``, which receives
    ``required_columns`` and ``bindings`` unchanged. Otherwise 'record' is
    set to an empty dict and nothing else is added.

    Args:
        record: Record values as an array, e.g. [1001, 'John', 75000, 60000].
        metadata: Column definitions used to map array positions to names.
        record_index: Position of this record in the batch (default: 0).
        required_columns: Optional set of column names to expose as
            top-level variables; None means no restriction is requested.
            The filtering itself happens in the helper.
        bindings: Optional mapping of generic variable names to column
            names, e.g. {'current': 'salary', 'minimum': 'min_salary'}.
            Interpreted by the helper.

    Returns:
        Dict of CEL variables. Always contains 'record_index'; contains
        'record' == {} when metadata or record is missing/empty.

    Example:
        >>> context = CELContextBuilder.build_table_context(
        ...     record=[1001, 75000, 60000],
        ...     metadata=metadata,
        ...     record_index=5,
        ...     required_columns={'salary', 'min_salary'}
        ... )
        >>> context['record_index']
        5
    """
    table_context: Dict[str, Any] = {'record_index': record_index}

    # Guard: without both metadata and a non-empty record there is nothing
    # to map, so expose an empty record and stop here.
    if not (metadata and record):
        table_context['record'] = {}
        return table_context

    CELContextBuilder._add_table_record_with_metadata(
        table_context, record, metadata, required_columns, bindings
    )
    return table_context
found in celtypes + # - TypeError: MapType constructor fails + return record_dict + + return record_dict + + @staticmethod + def validate_context(context: Dict[str, Any]) -> bool: + """ + Validate that context has required fields for CEL evaluation. + + WHAT THIS DOES: + Checks if a context dictionary contains all required variables + before passing it to CEL for evaluation. Prevents runtime errors. + + REQUIRED FIELDS: + - value: The column value being validated + - column_name: Name of the column + - record: Dictionary of all column values + + OPTIONAL FIELDS (not checked): + - record_index: Position in batch + - : Direct column access variables + + PARAMETERS: + context: Dictionary to validate + + RETURNS: + True if all required fields present, False otherwise + + USE CASE: + Use this before CEL evaluation to catch missing variables early + and provide better error messages to users. + + EXAMPLES: + >>> # Valid context + >>> context = {'value': 100, 'column_name': 'age', 'record': {}} + >>> is_valid = CELContextBuilder.validate_context(context) + >>> print(is_valid) # True + >>> + >>> # Invalid context (missing 'record') + >>> incomplete_context = {'value': 100, 'column_name': 'age'} + >>> is_valid = CELContextBuilder.validate_context(incomplete_context) + >>> print(is_valid) # False + """ + # Define minimum required fields for CEL evaluation + required_fields = ['value', 'column_name', 'record'] + + # Check if all required fields are present in context + return all(field in context for field in required_fields) + + @staticmethod + def get_available_variables() -> List[str]: + """ + Get list of core variables available in CEL context. + + WHAT THIS RETURNS: + A list of the standard variables that are always available in + CEL expressions, regardless of the data being validated. 
+ + CORE VARIABLES: + - value: Current column value being validated + - record: Dictionary of all column values + - column_name: Name of the column being validated + - record_index: Position of record in batch + + NOTE: In addition to these core variables, column names are also + available directly (e.g., 'min_salary', 'age') when metadata is + provided. This method only returns the core variables. + + USE CASES: + - Documentation generation + - Error messages showing available variables + - IDE autocomplete suggestions + - Validation of CEL expressions + + RETURNS: + List of core variable names + + EXAMPLE: + >>> variables = CELContextBuilder.get_available_variables() + >>> print(variables) + >>> # ['value', 'record', 'column_name', 'record_index'] + >>> + >>> # Use in error message: + >>> print(f"Available variables: {', '.join(variables)}") + >>> # Available variables: value, record, column_name, record_index + """ + return ['value', 'record', 'column_name', 'record_index'] + +# Made with Bob diff --git a/src/wxdi/dq_validator/cel_exceptions.py b/src/wxdi/dq_validator/cel_exceptions.py new file mode 100644 index 0000000..b5a5559 --- /dev/null +++ b/src/wxdi/dq_validator/cel_exceptions.py @@ -0,0 +1,74 @@ +# Copyright 2026 IBM Corporation +# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# See the LICENSE file in the project root for license information. + +""" +Custom exceptions for CEL (Common Expression Language) validation. 
class CELError(Exception):
    """
    Root of the CEL exception hierarchy.

    Catch this type to handle any CEL-specific failure
    (CELCompilationError or CELEvaluationError) in one place.
    """


class CELCompilationError(CELError):
    """
    Signals that a CEL expression could not be compiled.

    CELCheck and TableCELCheck compile their expression while the check
    object is being constructed, so a broken expression surfaces
    immediately (fail-fast) rather than on the first validated record.

    Typical causes are malformed expressions, for example:
        - Incomplete expression: 'value >'
        - Invalid operator: 'value === 100'
        - Mismatched parentheses: 'value > (100'

    Example:
        >>> from wxdi.dq_validator import CELCheck
        >>> try:
        ...     check = CELCheck('value >')  # Invalid syntax
        ... except CELCompilationError as e:
        ...     print(f"Compilation failed: {e}")
    """


class CELEvaluationError(CELError):
    """
    Signals that a compiled CEL expression failed while being evaluated.

    Intended for runtime problems such as:
        - Type mismatch: 'value + "string"' when value is numeric
        - Missing field: 'record.missing_field > 0'
        - Division by zero: 'value / record.zero_field'
        - Invalid operation: 'value.contains(123)' when value is not a string

    Note:
        CELCheck.validate() catches this exception and reports it as a
        ValidationError instead of letting it propagate to the caller.

    Example:
        >>> from wxdi.dq_validator import CELCheck
        >>> check = CELCheck('value + record.missing_field')
        >>> # validate() returns a ValidationError for this record
        >>> # rather than raising CELEvaluationError
    """
class CELCheck(BaseCheck):
    """
    Validates a column value using a CEL (Common Expression Language) expression.

    OVERVIEW:
        CEL is a non-Turing complete expression language designed for safe,
        fast evaluation. This check lets customers define custom validation
        logic without modifying code. Expressions are compiled and evaluated
        with the ``celpy`` module (PyPI package: cel-python).

    CASE SENSITIVITY:
        Column names are CASE-SENSITIVE. 'birth_date' and 'Birth_date' are
        different, as are 'firstName' and 'First_Name'. Use the exact column
        names from metadata.

    SYNTAX OPTIONS:
        1. SIMPLE SYNTAX (RECOMMENDED) - direct column access:
           'value > min_salary', 'age > 40', 'department == "Sales"'
        2. EXPLICIT SYNTAX - access through the record:
           'value > record.min_salary', 'record.age > 40'
           Required for columns whose name collides with a reserved name
           (value, column_name, record_index, record).

        Both styles can be mixed in the same expression.

    AVAILABLE VARIABLES:
        value          Current column value being validated
        column_name    Name of the column being validated
        record_index   Position of the record in the batch (0-based)
        record         Dict of all columns: {'col1': val1, 'col2': val2}
        <column>       Direct access to each column (e.g. min_salary)

    SUPPORTED OPERATORS:
        - Comparison: ==, !=, <, <=, >, >=
        - Logical: &&, ||, !
        - Arithmetic: +, -, *, /, %
        - String: contains, startsWith, endsWith, matches
        - List: in, size, all, exists
        - Ternary: condition ? true_value : false_value

    EXAMPLES:
        Simple value check:
            >>> check = CELCheck('value > 0')

        Multi-column comparison (simple and explicit syntax):
            >>> check = CELCheck('value > min_salary')
            >>> check = CELCheck('value > record.min_salary')

        Business logic with a custom message:
            >>> check = CELCheck(
            ...     expression='age > 40 ? value >= 80000 : value >= 50000',
            ...     error_message='Salary does not meet age-based requirements'
            ... )

        String and list operations:
            >>> check = CELCheck('value.endsWith("@company.com")')
            >>> check = CELCheck('value in ["Active", "Pending", "Approved"]')

    RESERVED NAMES:
        If your data has columns named 'value', 'column_name', 'record_index'
        or 'record', use explicit syntax: 'record.value' instead of 'value'.

    PERFORMANCE:
        The expression is compiled once at initialization and the compiled
        program is reused for every validate() call.
    """

    # Maximum expression length (measured after stripping) to prevent abuse
    MAX_EXPRESSION_LENGTH = 1000

    def __init__(
        self,
        expression: str,
        error_message: Optional[str] = None,
        dimension: DataQualityDimension = DataQualityDimension.VALIDITY,
        description: Optional[str] = None,
        bindings: Optional[Dict[str, str]] = None
    ):
        """
        Initialize the check and compile its expression (fail-fast).

        Args:
            expression: CEL expression that must evaluate to a boolean.
                Supports both simple ('min_salary') and explicit
                ('record.min_salary') syntax. Leading/trailing whitespace
                is stripped before the expression is stored and compiled.
            error_message: Custom failure message (optional). When omitted,
                failures report "CEL validation failed: <expression>".
            dimension: Data quality dimension (default: VALIDITY).
            description: Human-readable description (optional). Defaults to
                "CEL: <expression>".
            bindings: Variable name to column name mapping (optional), which
                allows generic expressions with placeholder variables.
                Example: {'current_value': 'salary', 'minimum': 'min_salary'}
                with expression 'current_value > minimum'. A falsy value is
                treated as "no bindings".

        Raises:
            ValueError: If the expression is empty, whitespace-only or longer
                than MAX_EXPRESSION_LENGTH characters, or if bindings is not
                a dict of non-empty strings.
            CELCompilationError: If the expression cannot be compiled.

        Examples:
            Basic usage:
                >>> check = CELCheck('value > 0')

            Reusable template with bindings (same expression, different columns):
                >>> salary_check = CELCheck(
                ...     expression='current > minimum',
                ...     bindings={'current': 'salary', 'minimum': 'min_salary'}
                ... )
                >>> bonus_check = CELCheck(
                ...     expression='current > minimum',
                ...     bindings={'current': 'bonus', 'minimum': 'min_bonus'}
                ... )
        """
        super().__init__(dimension)

        # Validate and set expression
        self.expression = self._validate_expression(expression)

        # Set metadata
        self.error_message = error_message
        self.description = description or f"CEL: {self.expression}"

        # Validate and set bindings
        self.bindings = self._validate_bindings(bindings or {})

        # Compile CEL expression
        self._env, self._ast, self._program = self._compile_expression()

        # Extract required columns for optimization
        self._required_columns = self._extract_column_references()

    def _validate_expression(self, expression: str) -> str:
        """
        Validate the CEL expression and return it stripped of outer whitespace.

        Raises:
            ValueError: If the expression is empty/whitespace-only or exceeds
                MAX_EXPRESSION_LENGTH characters after stripping.
        """
        if not expression or not expression.strip():
            raise ValueError("CEL expression cannot be empty")

        normalized = expression.strip()

        if len(normalized) > self.MAX_EXPRESSION_LENGTH:
            raise ValueError(
                f"CEL expression too long: {len(normalized)} characters "
                f"(max: {self.MAX_EXPRESSION_LENGTH})"
            )

        return normalized

    def _validate_bindings(self, bindings: Dict[str, str]) -> Dict[str, str]:
        """
        Validate the bindings mapping and return it.

        A falsy argument yields a fresh empty dict. Otherwise the argument
        must be a dict whose keys and values are all non-empty strings; the
        same dict object is returned.

        Raises:
            ValueError: If bindings is not a dict, or any key/value is not a
                non-empty string.
        """
        if not bindings:
            return {}

        if not isinstance(bindings, dict):
            raise ValueError("bindings must be a dictionary")

        for var_name, col_name in bindings.items():
            if not isinstance(var_name, str) or not isinstance(col_name, str):
                raise ValueError("binding keys and values must be strings")
            if not var_name or not col_name:
                raise ValueError("binding keys and values cannot be empty")

        return bindings

    def _compile_expression(self):
        """
        Compile self.expression with celpy.

        Returns:
            Tuple (environment, ast, program).

        Raises:
            CELCompilationError: Wraps any exception raised while creating
                the environment, compiling, or building the program.
        """
        try:
            env = celpy.Environment()
            ast = env.compile(self.expression)
            program = env.program(ast)
            return env, ast, program
        except Exception as e:
            raise CELCompilationError(
                f"Failed to compile CEL expression '{self.expression}': {str(e)}"
            ) from e

    def _extract_column_references(self) -> Optional[set]:
        """
        Collect the identifiers referenced by the compiled expression.

        PURPOSE:
            For assets with many columns, adding every column to the CEL
            context is wasteful. The result of this method is passed to the
            context builder as ``required_columns`` so it can restrict the
            context to what the expression uses.

        STRATEGY:
            1. Walk the compiled AST looking for identifier-like nodes.
            2. Skip reserved names (value, column_name, record_index, record).
            3. If the walk fails, or finds nothing, return None so that the
               caller falls back to using all columns (correctness over
               optimization).

        Returns:
            Set of referenced names, or None when all columns should be used.
            When bindings are in use, the names are the variables as written
            in the expression, not the bound column names.
        """
        reserved_names = {'value', 'column_name', 'record_index', 'record'}

        try:
            required_columns = set()
            self._extract_identifiers_from_node(self._ast, required_columns, reserved_names)
            return required_columns if required_columns else None
        except (AttributeError, TypeError, RuntimeError):
            # AST traversal failed (RuntimeError also covers RecursionError) -
            # return None to indicate all columns should be used
            return None

    def _extract_identifiers_from_node(self, node: Any, columns: set, reserved: set) -> None:
        """
        Recursively add identifier names found under *node* to *columns*.

        Args:
            node: AST node (or any object reachable from it); None is ignored.
            columns: Set that collects the names (mutated in place).
            reserved: Names that must not be collected from identifier nodes.
        """
        if node is None:
            return

        # Identifier-like node: anything exposing a string ``name``
        if hasattr(node, 'name') and isinstance(node.name, str):
            if node.name not in reserved:
                columns.add(node.name)

        # Select expression (record.field): the field name is collected even
        # if it is reserved, since 'record.value' addresses a real column.
        if hasattr(node, 'operand') and hasattr(node, 'field'):
            if hasattr(node.field, 'name') and isinstance(node.field.name, str):
                columns.add(node.field.name)

        # Recursively process child nodes
        self._process_child_nodes(node, columns, reserved)

    def _process_child_nodes(self, node: Any, columns: set, reserved: set) -> None:
        """
        Visit every public, non-callable attribute of *node*.

        List attributes are traversed element by element; other attributes
        are traversed only if they carry a ``__dict__``. AttributeError and
        TypeError raised while handling one attribute are swallowed so the
        walk can continue with the next attribute.
        """
        for attr_name in dir(node):
            if attr_name.startswith('_'):
                continue
            try:
                attr = getattr(node, attr_name, None)
                if attr is None or callable(attr):
                    continue
                if isinstance(attr, list):
                    for item in attr:
                        self._extract_identifiers_from_node(item, columns, reserved)
                elif hasattr(attr, '__dict__'):
                    self._extract_identifiers_from_node(attr, columns, reserved)
            except (AttributeError, TypeError):
                continue

    def validate_column_references(self, available_columns: list) -> None:
        """
        Check that the columns referenced by the expression exist.

        This validation is OPTIONAL. Call it after creating the check (and
        before adding it to a validator) to catch typos and case mismatches
        early instead of at evaluation time.

        Names found in the expression are first resolved through
        ``self.bindings``: a bound variable such as 'current' is checked as
        the column it maps to (e.g. 'salary'); unbound names are checked
        as-is.

        If no references could be extracted from the AST, a UserWarning is
        issued and the method returns without validating. If
        ``available_columns`` is empty, the method returns silently.

        Args:
            available_columns: Valid column names
                (e.g. [c.name for c in metadata.columns]).

        Raises:
            ValueError: If a referenced column is not in available_columns.
                The message lists each missing column, a case-sensitivity
                reminder, and the available columns.

        Examples:
            >>> check = CELCheck('birth_date != null')
            >>> check.validate_column_references(['birth_date', 'first_name'])
            >>> # No error - column exists

            >>> check = CELCheck('Birth_date != null')  # Wrong case
            >>> check.validate_column_references(['birth_date', 'first_name'])
            ValueError: CEL expression references non-existent column(s):
             - 'Birth_date' not found

            ⚠️ Column names are CASE-SENSITIVE. 'birth_date' and 'Birth_date' are different.
            Available columns: 'birth_date', 'first_name'
        """
        if not self._required_columns:
            # Could not extract columns from AST - issue warning and skip validation
            warnings.warn(
                f"Unable to validate column references for CEL expression '{self.expression}'. "
                "Column extraction from AST failed. Validation will occur at runtime.",
                UserWarning,
                stacklevel=2
            )
            return

        if not available_columns:
            # No columns provided - skip validation
            return

        # Resolve binding aliases to the columns they stand for; otherwise a
        # generic variable like 'current' would be reported as a missing column.
        referenced = {self.bindings.get(name, name) for name in self._required_columns}
        missing = sorted(referenced - set(available_columns))

        if missing:
            error_parts = [
                "CEL expression references non-existent column(s):"
            ]
            for col_name in missing:
                error_parts.append(f"\n - '{col_name}' not found")

            error_parts.append(
                "\n\n⚠️ Column names are CASE-SENSITIVE. "
                "'birth_date' and 'Birth_date' are different."
            )
            error_parts.append(
                f"\nAvailable columns: {', '.join(repr(c) for c in sorted(available_columns))}"
            )

            raise ValueError(''.join(error_parts))

    def get_check_name(self) -> str:
        """
        Return the name of this check type.

        Returns:
            'cel_check'
        """
        return "cel_check"

    def validate(
        self,
        value: Any,
        context: Dict[str, Any]
    ) -> Optional[ValidationError]:
        """
        Validate *value* by evaluating the compiled CEL expression.

        Steps:
            1. Build the CEL evaluation context from the validation context.
            2. Evaluate the compiled program.
            3. Require a boolean result.
            4. Return a ValidationError if the result is False.

        Any exception raised along the way is converted into a
        ValidationError; this method does not raise.

        Args:
            value: The value to validate.
            context: Validation context containing:
                - column_name: Name of the column being validated
                  (defaults to 'unknown' when absent)
                - record: Full record array
                - metadata: AssetMetadata object
                - record_index: Record position (optional, defaults to 0)

        Returns:
            ValidationError if validation fails or errors, None if it passes.

        Example:
            >>> check = CELCheck('value > 100')
            >>> context = {
            ...     'column_name': 'age',
            ...     'record': [1001, 'John', 25],
            ...     'metadata': metadata,
            ...     'record_index': 0
            ... }
            >>> error = check.validate(25, context)
            >>> if error:
            ...     print(error.message)  # "CEL validation failed: value > 100"
        """
        column_name = context.get('column_name', 'unknown')

        try:
            # Build CEL context and evaluate expression
            cel_context = self._build_cel_context(value, column_name, context)
            result = self._program.evaluate(cel_context)

            # Convert result to boolean and validate
            return self._process_evaluation_result(result, value, column_name)

        except CELEvaluationError as e:
            return self._create_evaluation_error(column_name, value, str(e))
        except Exception as e:
            # NOTE(review): errors raised by celpy itself presumably do not
            # derive from CELEvaluationError and therefore land here - confirm.
            return self._create_unexpected_error(column_name, value, str(e))

    def _build_cel_context(self, value: Any, column_name: str, context: Dict[str, Any]) -> Dict[str, Any]:
        """Build the CEL evaluation context via CELContextBuilder.build_context()."""
        return CELContextBuilder.build_context(
            value=value,
            column_name=column_name,
            record=context.get('record'),
            metadata=context.get('metadata'),
            record_index=context.get('record_index', 0),
            required_columns=self._required_columns,
            bindings=self.bindings
        )

    def _process_evaluation_result(
        self,
        result: Any,
        value: Any,
        column_name: str
    ) -> Optional[ValidationError]:
        """
        Turn a CEL evaluation result into a validation outcome.

        Returns:
            - ValidationError explaining the type problem when the result is
              not a boolean,
            - ValidationError with the custom/default message when the result
              is False,
            - None when the result is True.
        """
        result_bool = self._convert_to_bool(result)

        if result_bool is None:
            # Expression didn't return boolean
            return ValidationError(
                column_name=column_name,
                check_name=self.get_check_name(),
                message=(
                    f"CEL expression must return boolean, got {type(result).__name__}. "
                    f"Expression: '{self.expression}'"
                ),
                value=value
            )

        # Check validation result
        if not result_bool:
            error_msg = self.error_message or f"CEL validation failed: {self.expression}"
            return ValidationError(
                column_name=column_name,
                check_name=self.get_check_name(),
                message=error_msg,
                value=value,
                expected=f"Expression '{self.expression}' to be true"
            )

        return None

    def _convert_to_bool(self, result: Any) -> Optional[bool]:
        """
        Return *result* as a Python bool, or None if it is not a boolean.

        Only genuine booleans are accepted: Python ``bool`` and celpy's
        ``BoolType``. Testing for ``__bool__`` is not sufficient because
        every int defines it, which would let numeric expressions such as
        'value + 1' be silently interpreted by truthiness instead of being
        reported as non-boolean.
        """
        if isinstance(result, (bool, celpy.celtypes.BoolType)):
            return bool(result)
        return None

    def _create_evaluation_error(self, column_name: str, value: Any, error_msg: str) -> ValidationError:
        """Create the ValidationError reported for a CELEvaluationError."""
        return ValidationError(
            column_name=column_name,
            check_name=self.get_check_name(),
            message=f"CEL evaluation error: {error_msg}",
            value=value
        )

    def _create_unexpected_error(self, column_name: str, value: Any, error_msg: str) -> ValidationError:
        """Create the ValidationError reported for any other exception."""
        return ValidationError(
            column_name=column_name,
            check_name=self.get_check_name(),
            message=f"Unexpected error in CEL validation: {error_msg}",
            value=value
        )

    def __repr__(self) -> str:
        """
        String representation of the check.

        Example:
            >>> check = CELCheck('value > 100')
            >>> print(check)
            CELCheck(expression='value > 100')
        """
        return f"CELCheck(expression='{self.expression}')"

    def get_expression(self) -> str:
        """
        Get the CEL expression (stripped form stored at initialization).

        Example:
            >>> check = CELCheck('value > 100')
            >>> print(check.get_expression())
            value > 100
        """
        return self.expression

    def get_description(self) -> str:
        """
        Get the human-readable description of the check.

        Example:
            >>> check = CELCheck('value > 100', description='Age must exceed 100')
            >>> print(check.get_description())
            Age must exceed 100
        """
        return self.description
class TableCELCheck(BaseCheck):
    """
    CEL expression check for table-level validation.

    Unlike CELCheck which validates a single column value, TableCELCheck validates
    the entire record. This enables:
    - Cross-column validation (e.g., start_date < end_date)
    - Complex business rules spanning multiple fields
    - Conditional logic based on multiple columns

    ⚠️ IMPORTANT - CASE SENSITIVITY:
    Column names are CASE-SENSITIVE. 'birth_date' and 'Birth_date' are different.
    'firstName' and 'First_Name' are different. Use exact column names from metadata.

    Available Variables in CEL Expression:
    - Column names: Direct access to any column (e.g., salary, age, department)
    - record: Dictionary of all column values (e.g., record.salary, record.age)
    - record_index: Position of the record in the batch

    Note: Unlike CELCheck, there is NO 'value' or 'column_name' variable since
    we're validating the entire record, not a specific column.

    Example:
        >>> from wxdi.dq_validator import TableValidationRule, TableCELCheck
        >>>
        >>> # Multi-column validation
        >>> rule = TableValidationRule('salary_check')
        >>> rule.add_check(TableCELCheck(
        ...     'salary > min_salary && age >= 18',
        ...     error_message='Invalid salary/age combination'
        ... ))
        >>>
        >>> # Complex business rules
        >>> rule = TableValidationRule('department_rules')
        >>> rule.add_check(TableCELCheck(
        ...     'department == "Sales" ? salary >= 50000 : salary >= 40000',
        ...     error_message='Salary does not meet department requirements'
        ... ))
        >>>
        >>> # Cross-column consistency
        >>> rule = TableValidationRule('date_check')
        >>> rule.add_check(TableCELCheck(
        ...     'start_date < end_date',
        ...     error_message='Start date must be before end date'
        ... ))
    """

    def __init__(
        self,
        expression: str,
        error_message: Optional[str] = None,
        dimension: DataQualityDimension = DataQualityDimension.VALIDITY,
        bindings: Optional[Dict[str, str]] = None
    ):
        """
        Initialize table-level CEL check.

        Args:
            expression: CEL expression that evaluates to boolean
                Can reference any column by name (e.g., 'salary > min_salary')
                With bindings, can use generic variable names
            error_message: Custom error message (optional)
            dimension: Data quality dimension (default: VALIDITY)
            bindings: Variable name to column name mapping (optional)
                Example: {'current': 'salary', 'minimum': 'min_salary'}
                Expression: 'current > minimum' maps to 'salary > min_salary'

        Raises:
            CELCompilationError: If expression cannot be compiled
            ValueError: If bindings is not a dict of non-empty strings

        Example:
            >>> check = TableCELCheck(
            ...     'salary > min_salary && age >= 18',
            ...     error_message='Invalid salary/age combination'
            ... )

            >>> # With bindings
            >>> check = TableCELCheck(
            ...     'current > minimum && person_age >= 18',
            ...     bindings={'current': 'salary', 'minimum': 'min_salary', 'person_age': 'age'},
            ...     error_message='Invalid salary/age combination'
            ... )
        """
        super().__init__(dimension)
        self.expression = expression
        self.error_message = error_message or f"Table-level CEL check failed: {expression}"

        # Validate and set bindings
        self.bindings = self._validate_bindings(bindings or {})

        # Compile once up front; the program is reused for every record.
        self._ast, self._program = self._compile_expression(expression)

        # Extract required columns for performance optimization
        self._required_columns = self._extract_column_references()

    def _validate_bindings(self, bindings: Dict[str, str]) -> Dict[str, str]:
        """
        Validate the bindings dictionary.

        Args:
            bindings: Mapping of expression variable name -> column name

        Returns:
            A copy of the validated mapping ({} when nothing was supplied)

        Raises:
            ValueError: If bindings is not a dict, or any key/value is not a
                non-empty string
        """
        if not bindings:
            return {}

        if not isinstance(bindings, dict):
            raise ValueError("bindings must be a dictionary")

        for var_name, col_name in bindings.items():
            if not isinstance(var_name, str) or not isinstance(col_name, str):
                raise ValueError("binding keys and values must be strings")
            if not var_name or not col_name:
                raise ValueError("binding keys and values cannot be empty")

        # Store a copy so later mutation of the caller's dict cannot silently
        # change what this check resolves.
        return dict(bindings)

    def _compile_expression(self, expression: str):
        """
        Compile the CEL expression and return AST and program.

        Args:
            expression: CEL source text

        Returns:
            Tuple of (ast, program) as produced by celpy.Environment

        Raises:
            CELCompilationError: If celpy fails for any reason; the original
                exception is chained as __cause__
        """
        try:
            env = celpy.Environment()
            ast = env.compile(expression)
            program = env.program(ast)
            return ast, program
        except Exception as e:
            raise CELCompilationError(
                f"Failed to compile CEL expression '{expression}': {e}"
            ) from e

    def _extract_column_references(self) -> Optional[Set[str]]:
        """
        Extract the identifiers referenced in the CEL expression from compiled AST.

        This enables performance optimization by only adding required columns
        to the CEL context, which is critical for wide tables (100+ columns).

        The names returned are the identifiers as written in the expression;
        when bindings are in use these are the binding variable names, not the
        underlying column names.

        Returns:
            Set of identifier names, or None if nothing could be extracted or
            the traversal failed (safe fallback meaning "use all columns")
        """
        RESERVED = {'record', 'record_index', 'true', 'false', 'null'}

        try:
            required_columns = set()
            self._extract_identifiers_from_node(self._ast, required_columns, RESERVED)
            return required_columns if required_columns else None
        except Exception:
            # Deliberate best-effort: the AST shape is owned by celpy, so any
            # traversal failure degrades to "all columns" instead of failing
            # construction of the check.
            return None

    def _extract_identifiers_from_node(self, node: Any, columns: set, reserved: set) -> None:
        """
        Helper method to recursively extract identifiers from AST node.

        Args:
            node: AST node (or any object reachable from it); None is ignored
            columns: Output set, mutated in place with the names found
            reserved: Names that must never be reported as columns
        """
        if node is None:
            return

        # Check if this is an identifier node
        if hasattr(node, 'name') and isinstance(node.name, str):
            if node.name not in reserved:
                columns.add(node.name)

        # Check for select expressions (record.field)
        if hasattr(node, 'operand') and hasattr(node, 'field'):
            if hasattr(node.field, 'name') and isinstance(node.field.name, str):
                columns.add(node.field.name)

        # Recursively process child nodes
        self._process_child_nodes(node, columns, reserved)

    def _process_child_nodes(self, node: Any, columns: set, reserved: set) -> None:
        """
        Helper method to process child nodes of an AST node.

        Walks every public, non-callable attribute of ``node``; list items and
        objects that carry a ``__dict__`` are treated as child nodes.

        Args:
            node: Node whose attributes are inspected
            columns: Output set, mutated in place
            reserved: Names that must never be reported as columns
        """
        for attr_name in dir(node):
            if attr_name.startswith('_'):
                continue
            try:
                attr = getattr(node, attr_name, None)
                if attr is None or callable(attr):
                    continue
                if isinstance(attr, list):
                    for item in attr:
                        self._extract_identifiers_from_node(item, columns, reserved)
                elif hasattr(attr, '__dict__'):
                    self._extract_identifiers_from_node(attr, columns, reserved)
            except (AttributeError, TypeError):
                # Ignore errors during AST traversal
                continue

    def validate(self, value: Any, context: Dict[str, Any]) -> Optional[ValidationError]:
        """
        Validate entire record using CEL expression.

        Args:
            value: Ignored for table-level checks (always None)
            context: Must contain 'record' and 'metadata'. 'rule_name'
                (default 'table_rule') and 'record_index' (default 0) are
                optional.

        Returns:
            ValidationError if validation fails, None if passes

        Raises:
            ValueError: If 'record' or 'metadata' is missing from context
            CELEvaluationError: If CEL evaluation fails or the result cannot
                be converted to bool
        """
        record = context.get('record')
        metadata = context.get('metadata')
        rule_name = context.get('rule_name', 'table_rule')
        record_index = context.get('record_index', 0)

        if record is None or metadata is None:
            raise ValueError(
                "Table-level CEL check requires 'record' and 'metadata' in context"
            )

        # Build CEL context (no 'value', only record columns)
        # Apply bindings if provided to map generic variable names to actual columns
        cel_context = CELContextBuilder.build_table_context(
            record=record,
            metadata=metadata,
            record_index=record_index,
            required_columns=self._required_columns,
            bindings=self.bindings
        )

        # Evaluate CEL expression
        try:
            result = self._program.evaluate(cel_context)

            # Convert CEL result to Python bool
            # CEL returns BoolType, IntType, etc., not native Python types
            try:
                result_bool = bool(result)
            except (TypeError, ValueError) as e:
                # Catch specific exceptions when converting to bool
                raise CELEvaluationError(
                    f"CEL expression must return boolean-compatible value, got {type(result).__name__}: {result}"
                ) from e

            # If expression evaluates to False, validation failed
            if not result_bool:
                return ValidationError(
                    column_name=rule_name,  # Use rule name instead of column
                    check_name=self.get_check_name(),
                    message=self.error_message,
                    value=record
                )

            return None

        except CELEvaluationError:
            # Already the right type (raised just above); do not re-wrap.
            raise
        except Exception as e:
            raise CELEvaluationError(
                f"CEL evaluation failed for expression '{self.expression}': {e}"
            ) from e

    def get_check_name(self) -> str:
        """Return the name of this check type"""
        return "table_cel_check"

    def validate_column_references(self, available_columns: list) -> None:
        """
        Validate that all column references in the expression exist in metadata.

        This is an optional validation step that can be called after initialization
        to catch column name errors early (before runtime evaluation).

        Identifiers that are binding variables are resolved to the column they
        are bound to before the comparison, so an expression written against
        generic names is checked against the real columns it maps to.

        Args:
            available_columns: List of valid column names from metadata

        Raises:
            ValueError: If expression references non-existent columns

        Example:
            >>> metadata = AssetMetadata(columns=[
            ...     ColumnMetadata('salary', DataType.DECIMAL),
            ...     ColumnMetadata('age', DataType.INTEGER)
            ... ])
            >>> check = TableCELCheck('salary > min_salary')
            >>> check.validate_column_references([c.name for c in metadata.columns])
            ValueError: CEL expression references non-existent column(s): 'min_salary'
        """
        if self._required_columns is None:
            # Could not extract columns from AST - issue warning and skip validation
            warnings.warn(
                f"Unable to validate column references for table CEL expression '{self.expression}'. "
                "Column extraction from AST failed. Validation will occur at runtime.",
                UserWarning,
                stacklevel=2
            )
            return

        # The extracted names are expression identifiers. With bindings such as
        # {'current': 'salary'} the identifier 'current' is not a column itself,
        # so map each identifier to its bound column (or itself when unbound).
        referenced_columns = {
            self.bindings.get(identifier, identifier)
            for identifier in self._required_columns
        }
        missing_columns = referenced_columns - set(available_columns)

        if missing_columns:
            missing_list = "', '".join(sorted(missing_columns))
            available_list = "', '".join(sorted(available_columns))

            raise ValueError(
                f"CEL expression references non-existent column(s):\n"
                f" - '{missing_list}' not found\n"
                f"\n"
                f"⚠️ Column names are CASE-SENSITIVE.\n"
                f"Available columns: '{available_list}'"
            )

    def __repr__(self) -> str:
        return f"TableCELCheck(expression='{self.expression}')"
class TableValidationRule:
    """
    A named group of checks that are evaluated against a whole record.

    A column-level ValidationRule is bound to one column; this rule is not.
    Every check it holds receives the full record, which makes it suitable for:
    - Cross-column validation (e.g., start_date < end_date)
    - Complex business rules spanning multiple fields
    - Conditional logic based on multiple columns

    Example:
        >>> from wxdi.dq_validator import TableValidationRule, TableCELCheck
        >>>
        >>> # Multi-column validation
        >>> rule = TableValidationRule('salary_age_check')
        >>> rule.add_check(TableCELCheck(
        ...     'salary > min_salary && age >= 18',
        ...     error_message='Invalid salary/age combination'
        ... ))
        >>>
        >>> # Complex business rules
        >>> rule = TableValidationRule('department_rules')
        >>> rule.add_check(TableCELCheck(
        ...     'department == "Sales" ? salary >= 50000 : salary >= 40000',
        ...     error_message='Salary does not meet department requirements'
        ... ))
    """

    def __init__(self, rule_name: str = "table_rule"):
        """
        Create an empty rule.

        Args:
            rule_name: Label for the rule; it is handed to every check through
                the validation context under the 'rule_name' key
        """
        self.checks: List[BaseCheck] = []
        self.rule_name = rule_name

    def add_check(self, check: BaseCheck) -> 'TableValidationRule':
        """
        Append a check to this rule (fluent API).

        Args:
            check: The check to register (typically a TableCELCheck)

        Returns:
            This rule, so calls can be chained

        Example:
            >>> rule = TableValidationRule('business_rules')
            >>> rule.add_check(TableCELCheck('salary > 0'))
            >>> rule.add_check(TableCELCheck('age >= 18'))
        """
        self.checks.append(check)
        return self

    def validate(
        self,
        record: List[Any],
        metadata: AssetMetadata,
        record_index: int = 0
    ) -> List[ValidationError]:
        """
        Run every registered check against one record.

        Args:
            record: The record array to validate
            metadata: Asset metadata for column mapping
            record_index: Position of the record in the batch (for context)

        Returns:
            The errors reported by the checks, in registration order
            (empty when every check passes)
        """
        # One context dict is shared by all checks of this rule. There is no
        # per-column value at table level, so 'column_name' is explicitly None
        # and each check is called with value=None.
        shared_context = {
            'record': record,
            'metadata': metadata,
            'column_name': None,
            'rule_name': self.rule_name,
            'record_index': record_index
        }

        outcomes = (
            registered.validate(None, shared_context)
            for registered in self.checks
        )
        return [failure for failure in outcomes if failure]

    def __repr__(self) -> str:
        check_count = len(self.checks)
        return f"TableValidationRule(name='{self.rule_name}', checks={check_count})"
.add_check(TableCELCheck('salary > min_salary && age >= 18')) + ... ) + """ def __init__(self, metadata: AssetMetadata): """ - Initialize validator + Initialize validator. Args: metadata: Asset metadata defining table structure """ self.metadata = metadata self.rules: List[ValidationRule] = [] + self.table_rules: List[TableValidationRule] = [] def add_rule(self, rule: ValidationRule) -> 'Validator': """ - Add a validation rule (fluent API) + Add a column-level validation rule (fluent API). Args: rule: The validation rule to add Returns: Self for method chaining + + Example: + >>> validator.add_rule( + ... ValidationRule('email') + ... .add_check(FormatCheck('email')) + ... ) """ self.rules.append(rule) return self + def add_table_rule(self, rule: TableValidationRule) -> 'Validator': + """ + Add a table-level validation rule (fluent API). + + Table-level rules validate entire records, enabling cross-column + validation and complex business logic. + + Args: + rule: The table validation rule to add + + Returns: + Self for method chaining + + Example: + >>> validator.add_table_rule( + ... TableValidationRule('salary_check') + ... .add_check(TableCELCheck('salary > min_salary')) + ... ) + """ + self.table_rules.append(rule) + return self + def validate(self, record: List[Any], record_index: int = 0) -> ValidationResult: """ - Validate a single record + Validate a single record using both column-level and table-level rules. 
Args: record: The record array to validate @@ -62,10 +118,13 @@ def validate(self, record: List[Any], record_index: int = 0) -> ValidationResult """ result = ValidationResult(record, record_index) - # Count total checks - result.total_checks = sum(len(rule.checks) for rule in self.rules) + # Count total checks (column-level + table-level) + result.total_checks = ( + sum(len(rule.checks) for rule in self.rules) + + sum(len(rule.checks) for rule in self.table_rules) + ) - # Validate each rule + # Validate column-level rules for rule in self.rules: errors = rule.validate(record, self.metadata) @@ -80,6 +139,21 @@ def validate(self, record: List[Any], record_index: int = 0) -> ValidationResult for error in errors: result.add_error(error) + # Validate table-level rules + for rule in self.table_rules: + errors = rule.validate(record, self.metadata, record_index) + + # Track passed/failed checks + checks_in_rule = len(rule.checks) + failed_in_rule = len(errors) + passed_in_rule = checks_in_rule - failed_in_rule + + result.passed_checks += passed_in_rule + + # Add errors + for error in errors: + result.add_error(error) + return result def validate_batch(self, records: List[List[Any]]) -> List[ValidationResult]: @@ -98,5 +172,9 @@ def validate_batch(self, records: List[List[Any]]) -> List[ValidationResult]: ] def __repr__(self) -> str: - return f"Validator(table='{self.metadata.table_name}', rules={len(self.rules)})" + return ( + f"Validator(table='{self.metadata.table_name}', " + f"column_rules={len(self.rules)}, " + f"table_rules={len(self.table_rules)})" + ) diff --git a/src/wxdi/odcs_generator/README-GENERATE-ODCS-SCRIPT.md b/src/wxdi/odcs_generator/README-GENERATE-ODCS-SCRIPT.md index 9491007..3a04ab4 100644 --- a/src/wxdi/odcs_generator/README-GENERATE-ODCS-SCRIPT.md +++ b/src/wxdi/odcs_generator/README-GENERATE-ODCS-SCRIPT.md @@ -80,8 +80,8 @@ pip install -r requirements.txt 1. 
Clone the repository: ```bash - git clone - cd data-product-python-sdk + git clone https://github.com/IBM/data-intelligence-sdk.git + cd data-intelligence-sdk ``` 2. Install dependencies: @@ -133,15 +133,15 @@ The user account needs the following permissions: ### Command-Line Usage -Run the script directly from the `odcs_generator` directory: +Run the script as an installed module from the project root or your active environment: ```bash -python odcs_generator/generate_odcs_from_collibra.py +python -m wxdi.odcs_generator.generate_odcs_from_collibra ``` **Example:** ```bash -python odcs_generator/generate_odcs_from_collibra.py 019a57f9-62d2-7aa0-9f22-4fa2cea1180b +python -m wxdi.odcs_generator.generate_odcs_from_collibra 019a57f9-62d2-7aa0-9f22-4fa2cea1180b ``` This generates a file named `-odcs.yaml` in the current directory. @@ -151,7 +151,7 @@ This generates a file named `-odcs.yaml` in the current directory. Import and use the module in your Python code: ```python -from odcs_generator import CollibraClient, ODCSGenerator +from wxdi.odcs_generator import CollibraClient, ODCSGenerator # Initialize client client = CollibraClient( @@ -1069,18 +1069,18 @@ Generated YAML files can be validated using standard YAML validators and ODCS sc ## Project Structure -This script is part of the `data-product-python-sdk` project: +This script is part of the `data-intelligence-sdk` project: ``` -data-product-python-sdk/ -├── dph_services/ # Data Product Hub services -├── odcs_generator/ # ODCS generator module (this script) +data-intelligence-sdk/ +├── src/ +│ └── wxdi/ +│ ├── dph_services/ # Data Product Hub services +│ └── odcs_generator/ # ODCS generator module ├── examples/ # Usage examples -├── test/ # Test suites -│ ├── integration/ # Integration tests -│ └── unit/ # Unit tests -├── requirements.txt # Python dependencies -└── setup.py # Package setup +├── tests/ # Test suites +├── requirements.txt # Python dependencies +└── setup.py # Package setup ``` ## Support diff --git 
a/src/wxdi/odcs_generator/generate_odcs_from_collibra.py b/src/wxdi/odcs_generator/generate_odcs_from_collibra.py index 46e9470..8f0a0d7 100644 --- a/src/wxdi/odcs_generator/generate_odcs_from_collibra.py +++ b/src/wxdi/odcs_generator/generate_odcs_from_collibra.py @@ -21,8 +21,8 @@ This script fetches asset metadata from Collibra and generates an ODCS v3 compliant YAML file. Usage: - python generate_odcs_from_collibra.py - python generate_odcs_from_collibra.py 019a57f9-62d2-7aa0-9f22-4fa2cea1180b + python -m wxdi.odcs_generator.generate_odcs_from_collibra + python -m wxdi.odcs_generator.generate_odcs_from_collibra 019a57f9-62d2-7aa0-9f22-4fa2cea1180b Environment Variables: COLLIBRA_URL: Collibra instance URL (required) @@ -563,12 +563,11 @@ def _build_physical_type(self, attr_map: Dict[str, Any]) -> Optional[str]: # Build type string with parameters if scale is not None and precision is not None: return f"{base_type}({precision},{scale})" - elif precision is not None: + if precision is not None: return f"{base_type}({precision})" - elif size is not None: + if size is not None: return f"{base_type}({size})" - else: - return base_type + return base_type @staticmethod def _to_int(value: Any) -> Optional[int]: @@ -687,9 +686,9 @@ def _add_inline_comment_if_needed(line: str) -> str: """Add inline comment to server configuration fields if needed""" if ' server:' in line and 'CONFIGURE_SERVER_HOSTNAME' in line: return line + ' # ⚠️ UPDATE: e.g., prod.snowflake.acme.com' - elif ' type:' in line and 'DEFINE_SERVER_TYPE' in line: + if ' type:' in line and 'DEFINE_SERVER_TYPE' in line: return line + ' # ⚠️ UPDATE: e.g., snowflake, postgres, bigquery, redshift' - elif ' schema:' in line and 'CONFIGURE_SCHEMA_NAME' in line: + if ' schema:' in line and 'CONFIGURE_SCHEMA_NAME' in line: return line + ' # ⚠️ UPDATE: e.g., public, dbo, my_schema' return line diff --git a/src/wxdi/odcs_generator/generate_odcs_from_informatica.py 
b/src/wxdi/odcs_generator/generate_odcs_from_informatica.py index a5314c8..4cc309a 100644 --- a/src/wxdi/odcs_generator/generate_odcs_from_informatica.py +++ b/src/wxdi/odcs_generator/generate_odcs_from_informatica.py @@ -21,8 +21,8 @@ This script fetches asset metadata from Informatica and generates an ODCS v3 compliant YAML file. Usage: - python odcs_generator/generate_odcs_from_informatica.py - python odcs_generator/generate_odcs_from_informatica.py 1b5fc805-252d-4ba2-bd90-e943103e411b --cdgc-url https://cdgc.dm-us.informaticacloud.com -u username -p password + python -m wxdi.odcs_generator.generate_odcs_from_informatica + python -m wxdi.odcs_generator.generate_odcs_from_informatica 1b5fc805-252d-4ba2-bd90-e943103e411b --cdgc-url https://cdgc.dm-us.informaticacloud.com -u username -p password Environment Variables: INFORMATICA_CDGC_URL: Informatica CDGC URL (required, e.g., https://cdgc.dm-us.informaticacloud.com) @@ -72,6 +72,7 @@ } class InformaticaClient: + """Client for interacting with Informatica CDGC API.""" CONTENT_TYPE_JSON = "application/json" HEADERS_JSON = {"Accept": CONTENT_TYPE_JSON} @@ -169,6 +170,7 @@ def get_column_details(self, column_id: str) -> Dict[str, Any]: return self._fetch_asset(column_id) def parse_arguments(): + """Parse command line arguments.""" parser = argparse.ArgumentParser( description='Generate ODCS YAML file from Informatica asset', formatter_class=argparse.RawDescriptionHelpFormatter, @@ -183,6 +185,7 @@ def parse_arguments(): return parser.parse_args() def validate_arguments(args): + """Validate required command line arguments.""" if not args.cdgc_url: print("Error: Informatica CDGC URL is required. 
Set INFORMATICA_CDGC_URL environment variable or use --cdgc-url") print("Example: --cdgc-url https://cdgc.dm-us.informaticacloud.com") @@ -361,9 +364,9 @@ def _add_inline_comment_if_needed(line: str) -> str: """Add inline comment to server configuration fields if needed""" if ' server:' in line and 'CONFIGURE_SERVER_HOSTNAME' in line: return line + ' # ⚠️ UPDATE: e.g., prod.snowflake.acme.com' - elif ' type:' in line and 'CONFIGURE_SERVER_TYPE' in line: + if ' type:' in line and 'CONFIGURE_SERVER_TYPE' in line: return line + ' # ⚠️ UPDATE: e.g., snowflake, postgres, bigquery, redshift' - elif ' schema:' in line and 'CONFIGURE_SCHEMA_NAME' in line: + if ' schema:' in line and 'CONFIGURE_SCHEMA_NAME' in line: return line + ' # ⚠️ UPDATE: e.g., public, dbo, my_schema' return line @@ -418,6 +421,7 @@ def write_yaml_file(output_file: str, odcs_data: Dict[str, Any]) -> None: def main(): + """Main entry point for the ODCS generator from Informatica.""" args = parse_arguments() validate_arguments(args) diff --git a/src/wxdi/version.py b/src/wxdi/version.py index 52a54fa..12d1cc7 100644 --- a/src/wxdi/version.py +++ b/src/wxdi/version.py @@ -16,4 +16,4 @@ """ Version of IBM watsonx.data intelligence SDK """ -__version__ = '2.0.0' \ No newline at end of file +__version__ = '2.1.0-rc.1' \ No newline at end of file diff --git a/tests/src/dq_validator/test_cel_check.py b/tests/src/dq_validator/test_cel_check.py new file mode 100644 index 0000000..398824f --- /dev/null +++ b/tests/src/dq_validator/test_cel_check.py @@ -0,0 +1,1115 @@ +""" + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
class TestCELCheckInitialization:
    """Construction of CELCheck: stored attributes, defaults, and rejected input."""

    def test_init_simple_expression(self):
        """A plain expression is kept as given and the check name is 'cel_check'."""
        cel_check = CELCheck('value > 0')
        assert cel_check.expression == 'value > 0'
        assert cel_check.get_check_name() == 'cel_check'

    def test_init_with_error_message(self):
        """A caller-supplied error message is stored on the check."""
        cel_check = CELCheck('value > 100', error_message='Value must exceed 100')
        assert cel_check.error_message == 'Value must exceed 100'

    def test_init_with_dimension(self):
        """A caller-supplied dimension is reported by get_dimension()."""
        wanted = DataQualityDimension.CONSISTENCY
        cel_check = CELCheck('value > 0', dimension=wanted)
        assert cel_check.get_dimension() == DataQualityDimension.CONSISTENCY

    def test_init_with_description(self):
        """A caller-supplied description is stored on the check."""
        cel_check = CELCheck('value > 0', description='Positive value check')
        assert cel_check.description == 'Positive value check'

    def test_init_default_description(self):
        """Without a description, one is derived from the expression."""
        cel_check = CELCheck('value > 0')
        assert cel_check.description == 'CEL: value > 0'

    def test_init_empty_expression_raises_error(self):
        """An empty expression is rejected with ValueError."""
        with pytest.raises(ValueError, match="CEL expression cannot be empty"):
            CELCheck('')

    def test_init_whitespace_expression_raises_error(self):
        """A whitespace-only expression is rejected with ValueError."""
        with pytest.raises(ValueError, match="CEL expression cannot be empty"):
            CELCheck(' ')

    def test_init_too_long_expression_raises_error(self):
        """An over-long expression is rejected with ValueError."""
        # 101 clauses joined by ' && ' -- long enough to exceed the limit.
        oversized = ' && '.join(['value > 0'] * 101)
        with pytest.raises(ValueError, match="CEL expression too long"):
            CELCheck(oversized)

    def test_init_invalid_syntax_raises_compilation_error(self):
        """Malformed CEL surfaces as CELCompilationError."""
        incomplete = 'value >'  # right-hand operand missing
        with pytest.raises(CELCompilationError):
            CELCheck(incomplete)

    def test_init_strips_whitespace(self):
        """Surrounding whitespace is removed from the stored expression."""
        cel_check = CELCheck(' value > 0 ')
        assert cel_check.expression == 'value > 0'

    def test_get_expression(self):
        """get_expression() returns the expression text."""
        cel_check = CELCheck('value > 100')
        assert cel_check.get_expression() == 'value > 100'

    def test_get_description(self):
        """get_description() returns the description text."""
        cel_check = CELCheck('value > 0', description='Test description')
        assert cel_check.get_description() == 'Test description'

    def test_repr(self):
        """repr() shows the class name and the expression."""
        rendered = repr(CELCheck('value > 0'))
        assert rendered == "CELCheck(expression='value > 0')"
class TestCELCheckSimpleValidation:
    """Single-column comparisons evaluated through CELCheck.validate()."""

    @pytest.fixture
    def metadata(self):
        """Two-column table: integer 'id' and decimal 'value'."""
        columns = [
            ColumnMetadata('id', DataType.INTEGER),
            ColumnMetadata('value', DataType.DECIMAL)
        ]
        return AssetMetadata(table_name='test_table', columns=columns)

    @staticmethod
    def _context(metadata, record):
        """Build the validation context for the 'value' column of *record*."""
        return {
            'column_name': 'value',
            'record': record,
            'metadata': metadata,
            'record_index': 0
        }

    def test_validate_simple_greater_than_pass(self, metadata):
        """'value > 0' accepts a positive value."""
        cel_check = CELCheck('value > 0')
        outcome = cel_check.validate(100, self._context(metadata, [1, 100]))
        assert outcome is None

    def test_validate_simple_greater_than_fail(self, metadata):
        """'value > 0' rejects a negative value and reports message and column."""
        cel_check = CELCheck('value > 0', error_message='Must be positive')
        outcome = cel_check.validate(-50, self._context(metadata, [1, -50]))
        assert outcome is not None
        assert outcome.message == 'Must be positive'
        assert outcome.column_name == 'value'

    def test_validate_equality_pass(self, metadata):
        """'value == 100' accepts 100."""
        cel_check = CELCheck('value == 100')
        outcome = cel_check.validate(100, self._context(metadata, [1, 100]))
        assert outcome is None

    def test_validate_equality_fail(self, metadata):
        """'value == 100' rejects 50."""
        cel_check = CELCheck('value == 100')
        outcome = cel_check.validate(50, self._context(metadata, [1, 50]))
        assert outcome is not None

    def test_validate_less_than_or_equal(self, metadata):
        """'value <= 100' accepts the bound and below, rejects above."""
        cel_check = CELCheck('value <= 100')
        ctx = self._context(metadata, [1, 100])
        # On the bound
        assert cel_check.validate(100, ctx) is None
        # Below the bound
        assert cel_check.validate(50, ctx) is None
        # Above the bound
        assert cel_check.validate(101, ctx) is not None

    def test_validate_not_equal(self, metadata):
        """'value != 0' accepts non-zero and rejects zero."""
        cel_check = CELCheck('value != 0')
        ctx = self._context(metadata, [1, 100])
        # Non-zero passes
        assert cel_check.validate(100, ctx) is None
        # Zero fails
        assert cel_check.validate(0, ctx) is not None
class TestCELCheckMultiColumnValidation:
    """Expressions that compare 'value' with other columns via 'record.<name>'."""

    @pytest.fixture
    def metadata(self):
        """Employees table: emp_id, salary, min_salary, max_salary."""
        columns = [
            ColumnMetadata('emp_id', DataType.INTEGER),
            ColumnMetadata('salary', DataType.DECIMAL),
            ColumnMetadata('min_salary', DataType.DECIMAL),
            ColumnMetadata('max_salary', DataType.DECIMAL)
        ]
        return AssetMetadata(table_name='employees', columns=columns)

    @staticmethod
    def _context(metadata, record):
        """Build the validation context for the 'salary' column of *record*."""
        return {
            'column_name': 'salary',
            'record': record,
            'metadata': metadata,
            'record_index': 0
        }

    def test_validate_record_comparison_pass(self, metadata):
        """A salary above record.min_salary is accepted."""
        cel_check = CELCheck('value > record.min_salary')
        ctx = self._context(metadata, [1001, 75000, 60000, 100000])
        outcome = cel_check.validate(75000, ctx)
        assert outcome is None

    def test_validate_record_comparison_fail(self, metadata):
        """A salary below record.min_salary is rejected with the custom message."""
        cel_check = CELCheck('value > record.min_salary', error_message='Below minimum')
        ctx = self._context(metadata, [1001, 50000, 60000, 100000])
        outcome = cel_check.validate(50000, ctx)
        assert outcome is not None
        assert outcome.message == 'Below minimum'

    def test_validate_between_two_columns(self, metadata):
        """A salary must lie within [record.min_salary, record.max_salary]."""
        cel_check = CELCheck('value >= record.min_salary && value <= record.max_salary')
        ctx = self._context(metadata, [1001, 75000, 60000, 100000])
        # Inside the range
        assert cel_check.validate(75000, ctx) is None
        # Below the range
        assert cel_check.validate(50000, ctx) is not None
        # Above the range
        assert cel_check.validate(110000, ctx) is not None
class TestCELCheckConditionalLogic:
    """Ternary CEL expressions whose threshold depends on another column."""

    @pytest.fixture
    def metadata(self):
        """Employees table: emp_id, age, salary."""
        columns = [
            ColumnMetadata('emp_id', DataType.INTEGER),
            ColumnMetadata('age', DataType.INTEGER),
            ColumnMetadata('salary', DataType.DECIMAL)
        ]
        return AssetMetadata(table_name='employees', columns=columns)

    @staticmethod
    def _evaluate(metadata, record):
        """Validate the salary (third field) of *record* with a fresh age-dependent check."""
        cel_check = CELCheck('record.age > 40 ? value >= 80000 : value >= 50000')
        ctx = {
            'column_name': 'salary',
            'record': record,
            'metadata': metadata,
            'record_index': 0
        }
        return cel_check.validate(record[2], ctx)

    def test_validate_ternary_senior_pass(self, metadata):
        """Age 45 with salary 85000 satisfies the senior branch."""
        outcome = self._evaluate(metadata, [1001, 45, 85000])
        assert outcome is None

    def test_validate_ternary_senior_fail(self, metadata):
        """Age 45 with salary 70000 violates the senior branch."""
        outcome = self._evaluate(metadata, [1001, 45, 70000])
        assert outcome is not None

    def test_validate_ternary_junior_pass(self, metadata):
        """Age 30 with salary 60000 satisfies the junior branch."""
        outcome = self._evaluate(metadata, [1001, 30, 60000])
        assert outcome is None

    def test_validate_ternary_junior_fail(self, metadata):
        """Age 30 with salary 40000 violates the junior branch."""
        outcome = self._evaluate(metadata, [1001, 30, 40000])
        assert outcome is not None
value >= 80000 : value >= 50000') + context = { + 'column_name': 'salary', + 'record': [1001, 30, 40000], + 'metadata': metadata, + 'record_index': 0 + } + error = check.validate(40000, context) + assert error is not None + + +class TestCELCheckStringOperations: + """Tests for CEL string operations""" + + @pytest.fixture + def metadata(self): + """Create test metadata""" + return AssetMetadata( + table_name='users', + columns=[ + ColumnMetadata('user_id', DataType.INTEGER), + ColumnMetadata('email', DataType.STRING) + ] + ) + + def test_validate_ends_with_pass(self, metadata): + """Test endsWith string operation that passes""" + check = CELCheck('value.endsWith("@company.com")') + context = { + 'column_name': 'email', + 'record': [1, 'john@company.com'], + 'metadata': metadata, + 'record_index': 0 + } + error = check.validate('john@company.com', context) + assert error is None + + def test_validate_ends_with_fail(self, metadata): + """Test endsWith string operation that fails""" + check = CELCheck('value.endsWith("@company.com")', error_message='Invalid domain') + context = { + 'column_name': 'email', + 'record': [1, 'john@other.com'], + 'metadata': metadata, + 'record_index': 0 + } + error = check.validate('john@other.com', context) + assert error is not None + assert error.message == 'Invalid domain' + + def test_validate_starts_with(self, metadata): + """Test startsWith string operation""" + check = CELCheck('value.startsWith("admin_")') + context = { + 'column_name': 'email', + 'record': [1, 'admin_user@company.com'], + 'metadata': metadata, + 'record_index': 0 + } + # Should pass + assert check.validate('admin_user@company.com', context) is None + # Should fail + assert check.validate('user@company.com', context) is not None + + def test_validate_contains(self, metadata): + """Test string contains using 'in' operator""" + check = CELCheck('"@" in value') + context = { + 'column_name': 'email', + 'record': [1, 'john@company.com'], + 'metadata': metadata, + 
'record_index': 0 + } + # Should pass for valid email + assert check.validate('john@company.com', context) is None + # Should fail for invalid email + assert check.validate('invalid-email', context) is not None + + +class TestCELCheckListOperations: + """Tests for CEL list operations""" + + @pytest.fixture + def metadata(self): + """Create test metadata""" + return AssetMetadata( + table_name='employees', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('status', DataType.STRING) + ] + ) + + def test_validate_in_list_pass(self, metadata): + """Test 'in' list operation that passes""" + check = CELCheck('value in ["Active", "Pending", "Approved"]') + context = { + 'column_name': 'status', + 'record': [1, 'Active'], + 'metadata': metadata, + 'record_index': 0 + } + error = check.validate('Active', context) + assert error is None + + def test_validate_in_list_fail(self, metadata): + """Test 'in' list operation that fails""" + check = CELCheck('value in ["Active", "Pending", "Approved"]') + context = { + 'column_name': 'status', + 'record': [1, 'Inactive'], + 'metadata': metadata, + 'record_index': 0 + } + error = check.validate('Inactive', context) + assert error is not None + + def test_validate_not_in_list(self, metadata): + """Test negated 'in' list operation""" + check = CELCheck('!(value in ["Deleted", "Archived"])') + context = { + 'column_name': 'status', + 'record': [1, 'Active'], + 'metadata': metadata, + 'record_index': 0 + } + # Should pass for Active + assert check.validate('Active', context) is None + # Should fail for Deleted + assert check.validate('Deleted', context) is not None + + +class TestCELCheckLogicalOperators: + """Tests for CEL logical operators""" + + @pytest.fixture + def metadata(self): + """Create test metadata""" + return AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('id', DataType.INTEGER), + ColumnMetadata('value', DataType.DECIMAL) + ] + ) + + def test_validate_and_operator(self, metadata): 
+ """Test && (AND) operator""" + check = CELCheck('value > 0 && value < 100') + context = { + 'column_name': 'value', + 'record': [1, 50], + 'metadata': metadata, + 'record_index': 0 + } + # Should pass for value in range + assert check.validate(50, context) is None + # Should fail for negative + assert check.validate(-10, context) is not None + # Should fail for too large + assert check.validate(150, context) is not None + + def test_validate_or_operator(self, metadata): + """Test || (OR) operator""" + check = CELCheck('value < 0 || value > 100') + context = { + 'column_name': 'value', + 'record': [1, 50], + 'metadata': metadata, + 'record_index': 0 + } + # Should fail for value in middle range + assert check.validate(50, context) is not None + # Should pass for negative + assert check.validate(-10, context) is None + # Should pass for large + assert check.validate(150, context) is None + + def test_validate_not_operator(self, metadata): + """Test ! (NOT) operator""" + check = CELCheck('!(value == 0)') + context = { + 'column_name': 'value', + 'record': [1, 50], + 'metadata': metadata, + 'record_index': 0 + } + # Should pass for non-zero + assert check.validate(50, context) is None + # Should fail for zero + assert check.validate(0, context) is not None + + +class TestCELCheckEdgeCases: + """Tests for edge cases and error handling""" + + @pytest.fixture + def metadata(self): + """Create test metadata""" + return AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('id', DataType.INTEGER), + ColumnMetadata('value', DataType.DECIMAL) + ] + ) + + def test_validate_with_none_value(self, metadata): + """Test validation with None value""" + check = CELCheck('value != null') + context = { + 'column_name': 'value', + 'record': [1, None], + 'metadata': metadata, + 'record_index': 0 + } + # Should handle None gracefully + error = check.validate(None, context) + # May return error depending on CEL null handling + assert error is not None or error is None + + def 
test_validate_without_metadata(self, metadata): + """Test validation without metadata (fallback mode)""" + check = CELCheck('value > 0') + context = { + 'column_name': 'value', + 'record': [1, 100], + 'metadata': None, # No metadata + 'record_index': 0 + } + # Should still work with basic validation + error = check.validate(100, context) + assert error is None + + def test_validate_without_record(self, metadata): + """Test validation without record data""" + check = CELCheck('value > 0') + context = { + 'column_name': 'value', + 'record': None, # No record + 'metadata': metadata, + 'record_index': 0 + } + # Should still work for simple value checks + error = check.validate(100, context) + assert error is None + + def test_validate_default_error_message(self, metadata): + """Test that default error message is generated""" + check = CELCheck('value > 100') # No custom error message + context = { + 'column_name': 'value', + 'record': [1, 50], + 'metadata': metadata, + 'record_index': 0 + } + error = check.validate(50, context) + assert error is not None + assert 'value > 100' in error.message + + def test_validate_with_missing_column_in_record(self, metadata): + """Test validation when record is shorter than metadata""" + check = CELCheck('value > 0') + context = { + 'column_name': 'value', + 'record': [1], # Missing second column + 'metadata': metadata, + 'record_index': 0 + } + # Should handle gracefully + error = check.validate(100, context) + # Should still validate the value itself + assert error is None + + +class TestCELCheckArithmeticOperations: + """Tests for CEL arithmetic operations""" + + @pytest.fixture + def metadata(self): + """Create test metadata""" + return AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('id', DataType.INTEGER), + ColumnMetadata('price', DataType.DECIMAL), + ColumnMetadata('quantity', DataType.INTEGER) + ] + ) + + def test_validate_addition(self, metadata): + """Test arithmetic addition""" + check = CELCheck('value 
== record.price + 10') + context = { + 'column_name': 'price', + 'record': [1, 100, 5], + 'metadata': metadata, + 'record_index': 0 + } + # Should pass for 110 (100 + 10) + assert check.validate(110, context) is None + # Should fail for other values + assert check.validate(100, context) is not None + + def test_validate_multiplication(self, metadata): + """Test arithmetic multiplication""" + check = CELCheck('value == record.price * record.quantity') + context = { + 'column_name': 'price', + 'record': [1, 100, 5], + 'metadata': metadata, + 'record_index': 0 + } + # Should pass for 500 (100 * 5) + assert check.validate(500, context) is None + # Should fail for other values + assert check.validate(100, context) is not None + + def test_validate_modulo(self, metadata): + """Test modulo operation""" + check = CELCheck('value % 10 == 0') + context = { + 'column_name': 'price', + 'record': [1, 100, 5], + 'metadata': metadata, + 'record_index': 0 + } + # Should pass for multiples of 10 + assert check.validate(100, context) is None + assert check.validate(50, context) is None + # Should fail for non-multiples + assert check.validate(105, context) is not None + + +class TestCELCheckIntegration: + """Integration tests with Validator""" + + @pytest.fixture + def metadata(self): + """Create test metadata""" + return AssetMetadata( + table_name='employees', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('salary', DataType.DECIMAL), + ColumnMetadata('min_salary', DataType.DECIMAL) + ] + ) + + def test_cel_check_with_validator(self, metadata): + """Test CELCheck integration with Validator""" + from wxdi.dq_validator import Validator, ValidationRule + + validator = Validator(metadata) + validator.add_rule( + ValidationRule('salary') + .add_check(CELCheck('value > record.min_salary')) + ) + + # Test with valid record + valid_record = [1001, 75000, 60000] + results = validator.validate_batch([valid_record]) + assert len(results) == 1 + assert 
results[0].is_valid + + # Test with invalid record + invalid_record = [1002, 50000, 60000] + results = validator.validate_batch([invalid_record]) + assert len(results) == 1 + assert not results[0].is_valid + assert len(results[0].errors) > 0 + +# Made with Bob + + def test_simple_syntax_column_reference(self, metadata): + """Test simple syntax without 'record.' prefix""" + check = CELCheck('value > min_salary') + context = { + 'column_name': 'salary', + 'record': [1001, 75000, 60000, 100000], + 'metadata': metadata, + 'record_index': 0 + } + assert check.validate(75000, context) is None + assert check.validate(50000, context) is not None + + def test_both_syntaxes_work_identically(self, metadata): + """Test that simple and explicit syntax produce same results""" + simple = CELCheck('value > min_salary') + explicit = CELCheck('value > record.min_salary') + context = { + 'column_name': 'salary', + 'record': [1001, 75000, 60000, 100000], + 'metadata': metadata, + 'record_index': 0 + } + # Both should pass + assert simple.validate(75000, context) is None + assert explicit.validate(75000, context) is None + # Both should fail + assert simple.validate(50000, context) is not None + assert explicit.validate(50000, context) is not None + + def test_simple_syntax_conditional(self, metadata): + """Test simple syntax in conditional expressions""" + check = CELCheck('age > 40 ? 
value >= 80000 : value >= 50000') + context = { + 'column_name': 'salary', + 'record': [1001, 85000, 60000, 45], + 'metadata': AssetMetadata( + table_name='employees', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('salary', DataType.DECIMAL), + ColumnMetadata('min_salary', DataType.DECIMAL), + ColumnMetadata('age', DataType.INTEGER) + ] + ), + 'record_index': 0 + } + assert check.validate(85000, context) is None + + +class TestCELCheckBindings: + """Test variable bindings for column-level CEL checks.""" + + def test_basic_binding(self): + """Test basic variable binding with single column.""" + # Create check with binding: 'value' -> 'salary' + check = CELCheck( + expression='current_value > 50000', + bindings={'current_value': 'salary'} + ) + + # Create metadata + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('salary', DataType.DECIMAL) + ] + ) + + # Test with passing value + context = { + 'column_name': 'salary', + 'record': [60000], + 'metadata': metadata, + 'record_index': 0 + } + result = check.validate(60000, context) + assert result is None + + # Test with failing value + context = { + 'column_name': 'salary', + 'record': [40000], + 'metadata': metadata, + 'record_index': 0 + } + result = check.validate(40000, context) + assert result is not None + + def test_multiple_bindings(self): + """Test multiple variable bindings in single expression.""" + # Create check with multiple bindings + check = CELCheck( + expression='current_value > minimum && current_value < maximum', + bindings={ + 'current_value': 'salary', + 'minimum': 'min_salary', + 'maximum': 'max_salary' + } + ) + + # Create metadata + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('salary', DataType.DECIMAL), + ColumnMetadata('min_salary', DataType.DECIMAL), + ColumnMetadata('max_salary', DataType.DECIMAL) + ] + ) + + # Test with passing values + context = { + 'column_name': 'salary', + 'record': [60000, 50000, 
70000], + 'metadata': metadata, + 'record_index': 0 + } + result = check.validate(60000, context) + assert result is None + + # Test with failing values (below minimum) + context = { + 'column_name': 'salary', + 'record': [40000, 50000, 70000], + 'metadata': metadata, + 'record_index': 0 + } + result = check.validate(40000, context) + assert result is not None + + # Test with failing values (above maximum) + context = { + 'column_name': 'salary', + 'record': [80000, 50000, 70000], + 'metadata': metadata, + 'record_index': 0 + } + result = check.validate(80000, context) + assert result is not None + + def test_binding_with_original_column_access(self): + """Test that bindings work alongside original column names.""" + # Create check using both binding and original column name + check = CELCheck( + expression='current_value > min_salary && salary < 100000', + bindings={'current_value': 'salary'} + ) + + # Create metadata + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('salary', DataType.DECIMAL), + ColumnMetadata('min_salary', DataType.DECIMAL) + ] + ) + + # Test with passing values + context = { + 'column_name': 'salary', + 'record': [60000, 50000], + 'metadata': metadata, + 'record_index': 0 + } + result = check.validate(60000, context) + assert result is None + + def test_binding_missing_column(self): + """Test behavior when bound column doesn't exist.""" + # Create check with binding to non-existent column + check = CELCheck( + expression='current_value > 50000', + bindings={'current_value': 'nonexistent_column'} + ) + + # Create metadata without the bound column + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('salary', DataType.DECIMAL) + ] + ) + + # Should fail with evaluation error (variable not found) + context = { + 'column_name': 'salary', + 'record': [60000], + 'metadata': metadata, + 'record_index': 0 + } + result = check.validate(60000, context) + assert result is not None + assert 
'current_value' in result.message.lower() or 'undefined' in result.message.lower() + + def test_empty_bindings(self): + """Test that empty bindings dict works (backward compatibility).""" + check = CELCheck( + expression='salary > 50000', + bindings={} + ) + + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('salary', DataType.DECIMAL) + ] + ) + + context = { + 'column_name': 'salary', + 'record': [60000], + 'metadata': metadata, + 'record_index': 0 + } + result = check.validate(60000, context) + assert result is None + + def test_none_bindings(self): + """Test that None bindings works (backward compatibility).""" + check = CELCheck( + expression='salary > 50000', + bindings=None + ) + + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('salary', DataType.DECIMAL) + ] + ) + + context = { + 'column_name': 'salary', + 'record': [60000], + 'metadata': metadata, + 'record_index': 0 + } + result = check.validate(60000, context) + assert result is None + + def test_invalid_bindings_type(self): + """Test that invalid bindings type raises error.""" + with pytest.raises(ValueError, match="bindings must be a dictionary"): + CELCheck( + expression='current_value > 50000', + bindings=['invalid'] # Should be dict, not list + ) + + def test_invalid_binding_key(self): + """Test that invalid binding key raises error.""" + with pytest.raises(ValueError, match="binding keys and values cannot be empty"): + CELCheck( + expression='current_value > 50000', + bindings={'': 'salary'} # Empty string key + ) + + def test_invalid_binding_value(self): + """Test that invalid binding value raises error.""" + with pytest.raises(ValueError, match="binding keys and values cannot be empty"): + CELCheck( + expression='current_value > 50000', + bindings={'current_value': ''} # Empty string value + ) + + +class TestCELCheckHelperMethodsCoverage: + """Tests for CELCheck helper methods to improve code coverage""" + + def 
test_validate_column_references_with_none_required_columns(self): + """Test validate_column_references when _required_columns is None""" + check = CELCheck('value > 0') + check._required_columns = None + # Should not raise error + check.validate_column_references(['col1', 'col2']) + + def test_validate_column_references_with_empty_available_columns(self): + """Test validate_column_references with empty available columns list""" + check = CELCheck('value > min_salary') + # Should not raise error when available_columns is empty + check.validate_column_references([]) + + def test_validate_column_references_returns_silently_when_no_required_columns(self): + """Test validate_column_references returns silently when _required_columns is None or empty""" + check = CELCheck('value > 100') + # When _required_columns is None/empty, should not raise error + check.validate_column_references(['some_col']) # Should not raise + check.validate_column_references([]) # Should not raise + + def test_complex_ast_with_nested_expressions(self): + """Test complex AST traversal with deeply nested expressions""" + check = CELCheck('(value > 0 && value < 100) || (value > 200 && value < 300)') + metadata = AssetMetadata( + table_name='test', + columns=[ColumnMetadata('value', DataType.INTEGER)] + ) + context = { + 'column_name': 'value', + 'record': [50], + 'metadata': metadata, + 'record_index': 0 + } + assert check.validate(50, context) is None + assert check.validate(150, context) is not None + assert check.validate(250, context) is None + + def test_record_field_access_in_expression(self): + """Test expressions with record.field access pattern""" + check = CELCheck('value > record.min_value && value < record.max_value') + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('value', DataType.INTEGER), + ColumnMetadata('min_value', DataType.INTEGER), + ColumnMetadata('max_value', DataType.INTEGER) + ] + ) + context = { + 'column_name': 'value', + 'record': [50, 0, 
100], + 'metadata': metadata, + 'record_index': 0 + } + assert check.validate(50, context) is None + assert check.validate(-10, context) is not None + assert check.validate(150, context) is not None + + def test_validation_with_short_record(self): + """Test validation when record is shorter than metadata columns""" + check = CELCheck('value > 0') + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('col1', DataType.INTEGER), + ColumnMetadata('col2', DataType.STRING), + ColumnMetadata('col3', DataType.DECIMAL) + ] + ) + context = { + 'column_name': 'col1', + 'record': [100, 'test'], # Missing col3 + 'metadata': metadata, + 'record_index': 0 + } + # Should handle gracefully + assert check.validate(100, context) is None + + def test_validation_with_required_columns_optimization(self): + """Test that required_columns optimization works correctly""" + check = CELCheck('value > min_salary') + # Check should have extracted required columns + if check._required_columns: + assert 'min_salary' in check._required_columns or len(check._required_columns) == 0 + + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('salary', DataType.DECIMAL), + ColumnMetadata('min_salary', DataType.DECIMAL), + ColumnMetadata('unused_col1', DataType.STRING), + ColumnMetadata('unused_col2', DataType.STRING) + ] + ) + context = { + 'column_name': 'salary', + 'record': [60000, 50000, 'unused1', 'unused2'], + 'metadata': metadata, + 'record_index': 0 + } + assert check.validate(60000, context) is None + + +class TestCELCheckErrorPathsCoverage: + """Tests for error paths and edge cases to improve coverage""" + + def test_validation_with_cel_evaluation_error(self): + """Test handling of CEL evaluation errors""" + check = CELCheck('value.nonexistent_method()') + metadata = AssetMetadata( + table_name='test', + columns=[ColumnMetadata('value', DataType.STRING)] + ) + context = { + 'column_name': 'value', + 'record': ['test'], + 'metadata': metadata, + 
'record_index': 0 + } + error = check.validate('test', context) + assert error is not None + # Should contain error information + assert 'error' in error.message.lower() or 'failed' in error.message.lower() + + def test_validation_with_type_mismatch(self): + """Test validation with type mismatches""" + check = CELCheck('value > 100') + metadata = AssetMetadata( + table_name='test', + columns=[ColumnMetadata('value', DataType.STRING)] + ) + context = { + 'column_name': 'value', + 'record': ['not_a_number'], + 'metadata': metadata, + 'record_index': 0 + } + error = check.validate('not_a_number', context) + # Should handle type mismatch gracefully + assert error is not None + + def test_bindings_with_non_string_keys(self): + """Test that non-string binding keys raise error""" + with pytest.raises(ValueError, match="binding keys and values must be strings"): + CELCheck( + expression='current > 50000', + bindings={123: 'salary'} # Integer key instead of string + ) + + def test_bindings_with_non_string_values(self): + """Test that non-string binding values raise error""" + with pytest.raises(ValueError, match="binding keys and values must be strings"): + CELCheck( + expression='current > 50000', + bindings={'current': 123} # Integer value instead of string + ) + + def test_validate_column_references_with_all_columns_present(self): + """Test validate_column_references when all columns are present""" + check = CELCheck('record.age > 18') + + # Should not raise any error + check.validate_column_references(['age', 'name', 'email']) + + def test_validate_column_references_with_no_required_columns(self): + """Test validate_column_references with expression using only 'value'""" + check = CELCheck('value > 100') + + # Should not raise error even with empty column list + check.validate_column_references([]) + check.validate_column_references(['some', 'columns']) + + +# Made with Bob diff --git a/tests/src/dq_validator/test_cel_context.py 
b/tests/src/dq_validator/test_cel_context.py new file mode 100644 index 0000000..ba6cfbf --- /dev/null +++ b/tests/src/dq_validator/test_cel_context.py @@ -0,0 +1,431 @@ +""" + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import pytest +from wxdi.dq_validator.cel_context import CELContextBuilder +from wxdi.dq_validator.metadata import AssetMetadata, ColumnMetadata, DataType + + +class TestCELContextBuilderBasic: + """Tests for basic CELContextBuilder functionality""" + + @pytest.fixture + def metadata(self): + """Create test metadata""" + return AssetMetadata( + table_name='employees', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('name', DataType.STRING), + ColumnMetadata('salary', DataType.DECIMAL) + ] + ) + + def test_build_context_with_all_parameters(self, metadata): + """Test building context with all parameters""" + record = [1001, 'John Doe', 75000] + context = CELContextBuilder.build_context( + value=75000, + column_name='salary', + record=record, + metadata=metadata, + record_index=5 + ) + + assert context['value'] == 75000 + assert context['column_name'] == 'salary' + assert context['record_index'] == 5 + assert 'record' in context + + def test_build_context_minimal_parameters(self, metadata): + """Test building context with minimal parameters""" + context = CELContextBuilder.build_context( + value=100, + column_name='test', + record=None, + metadata=None + ) + + assert context['value'] == 100 + assert 
context['column_name'] == 'test' + assert context['record_index'] == 0 # Default value + assert context['record'] == {} # Empty dict when no record + + def test_build_context_default_record_index(self, metadata): + """Test that record_index defaults to 0""" + context = CELContextBuilder.build_context( + value=100, + column_name='test', + record=None, + metadata=None + ) + + assert context['record_index'] == 0 + + def test_build_context_with_custom_record_index(self, metadata): + """Test building context with custom record_index""" + context = CELContextBuilder.build_context( + value=100, + column_name='test', + record=None, + metadata=None, + record_index=42 + ) + + assert context['record_index'] == 42 + + +class TestCELContextBuilderRecordDict: + """Tests for record dictionary building""" + + @pytest.fixture + def metadata(self): + """Create test metadata""" + return AssetMetadata( + table_name='employees', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('name', DataType.STRING), + ColumnMetadata('salary', DataType.DECIMAL), + ColumnMetadata('department', DataType.STRING) + ] + ) + + def test_build_record_dict_complete_record(self, metadata): + """Test building record dict with complete record""" + record = [1001, 'John Doe', 75000, 'Engineering'] + context = CELContextBuilder.build_context( + value=75000, + column_name='salary', + record=record, + metadata=metadata + ) + + record_dict = context['record'] + # Check if it's a CEL MapType or dict + if hasattr(record_dict, '__getitem__'): + assert record_dict['emp_id'] == 1001 + assert record_dict['name'] == 'John Doe' + assert record_dict['salary'] == 75000 + assert record_dict['department'] == 'Engineering' + + def test_build_record_dict_partial_record(self, metadata): + """Test building record dict with partial record (fewer values than columns)""" + record = [1001, 'John Doe'] # Missing salary and department + context = CELContextBuilder.build_context( + value=1001, + 
column_name='emp_id', + record=record, + metadata=metadata + ) + + record_dict = context['record'] + if hasattr(record_dict, '__getitem__'): + assert record_dict['emp_id'] == 1001 + assert record_dict['name'] == 'John Doe' + # Missing columns should be None + assert record_dict['salary'] is None + assert record_dict['department'] is None + + def test_build_record_dict_without_metadata(self): + """Test building record dict without metadata (fallback mode)""" + record = [1001, 'John Doe', 75000] + context = CELContextBuilder.build_context( + value=75000, + column_name='salary', + record=record, + metadata=None + ) + + record_dict = context['record'] + # Should use positional fallback: col_0, col_1, col_2 + assert record_dict['col_0'] == 1001 + assert record_dict['col_1'] == 'John Doe' + assert record_dict['col_2'] == 75000 + + def test_build_record_dict_empty_record(self, metadata): + """Test building record dict with empty record""" + record = [] + context = CELContextBuilder.build_context( + value=100, + column_name='test', + record=record, + metadata=metadata + ) + + record_dict = context['record'] + # All columns should be None when record is empty + # The MapType will have the keys with None values + if hasattr(record_dict, '__getitem__'): + # Check that we can access the keys and they are None + try: + assert record_dict['emp_id'] is None + assert record_dict['name'] is None + assert record_dict['salary'] is None + assert record_dict['department'] is None + except KeyError: + # If MapType doesn't include None values, that's also acceptable + # as long as the record_dict exists + assert record_dict is not None + + def test_build_record_dict_none_record(self, metadata): + """Test building record dict with None record""" + context = CELContextBuilder.build_context( + value=100, + column_name='test', + record=None, + metadata=metadata + ) + + assert context['record'] == {} + + +class TestCELContextBuilderValidation: + """Tests for context validation""" + + def 
test_validate_context_valid(self): + """Test validation of valid context""" + context = { + 'value': 100, + 'column_name': 'test', + 'record': {}, + 'record_index': 0 + } + + assert CELContextBuilder.validate_context(context) is True + + def test_validate_context_missing_value(self): + """Test validation fails when value is missing""" + context = { + 'column_name': 'test', + 'record': {}, + 'record_index': 0 + } + + assert CELContextBuilder.validate_context(context) is False + + def test_validate_context_missing_column_name(self): + """Test validation fails when column_name is missing""" + context = { + 'value': 100, + 'record': {}, + 'record_index': 0 + } + + assert CELContextBuilder.validate_context(context) is False + + def test_validate_context_missing_record(self): + """Test validation fails when record is missing""" + context = { + 'value': 100, + 'column_name': 'test', + 'record_index': 0 + } + + assert CELContextBuilder.validate_context(context) is False + + def test_validate_context_empty(self): + """Test validation fails for empty context""" + context = {} + + assert CELContextBuilder.validate_context(context) is False + + def test_validate_context_extra_fields_ok(self): + """Test validation passes with extra fields""" + context = { + 'value': 100, + 'column_name': 'test', + 'record': {}, + 'record_index': 0, + 'extra_field': 'extra' + } + + assert CELContextBuilder.validate_context(context) is True + + +class TestCELContextBuilderUtilities: + """Tests for utility methods""" + + def test_get_available_variables(self): + """Test getting list of available variables""" + variables = CELContextBuilder.get_available_variables() + + assert 'value' in variables + assert 'record' in variables + assert 'column_name' in variables + assert 'record_index' in variables + assert len(variables) == 4 + + def test_get_available_variables_returns_list(self): + """Test that get_available_variables returns a list""" + variables = CELContextBuilder.get_available_variables() + 
+ assert isinstance(variables, list) + + +class TestCELContextBuilderDataTypes: + """Tests for different data types in context""" + + @pytest.fixture + def metadata(self): + """Create test metadata with various data types""" + return AssetMetadata( + table_name='test_table', + columns=[ + ColumnMetadata('int_col', DataType.INTEGER), + ColumnMetadata('decimal_col', DataType.DECIMAL), + ColumnMetadata('string_col', DataType.STRING), + ColumnMetadata('bool_col', DataType.BOOLEAN) + ] + ) + + def test_build_context_with_integer(self, metadata): + """Test context building with integer value""" + record = [42, 3.14, 'test', True] + context = CELContextBuilder.build_context( + value=42, + column_name='int_col', + record=record, + metadata=metadata + ) + + assert context['value'] == 42 + assert isinstance(context['value'], int) + + def test_build_context_with_decimal(self, metadata): + """Test context building with decimal value""" + record = [42, 3.14, 'test', True] + context = CELContextBuilder.build_context( + value=3.14, + column_name='decimal_col', + record=record, + metadata=metadata + ) + + assert context['value'] == 3.14 + assert isinstance(context['value'], float) + + def test_build_context_with_string(self, metadata): + """Test context building with string value""" + record = [42, 3.14, 'test', True] + context = CELContextBuilder.build_context( + value='test', + column_name='string_col', + record=record, + metadata=metadata + ) + + assert context['value'] == 'test' + assert isinstance(context['value'], str) + + def test_build_context_with_boolean(self, metadata): + """Test context building with boolean value""" + record = [42, 3.14, 'test', True] + context = CELContextBuilder.build_context( + value=True, + column_name='bool_col', + record=record, + metadata=metadata + ) + + assert context['value'] is True + assert isinstance(context['value'], bool) + + def test_build_context_with_none(self, metadata): + """Test context building with None value""" + record = [42, 
None, 'test', True] + context = CELContextBuilder.build_context( + value=None, + column_name='decimal_col', + record=record, + metadata=metadata + ) + + assert context['value'] is None + + +class TestCELContextBuilderComplexScenarios: + """Tests for complex scenarios""" + + def test_build_context_large_record(self): + """Test building context with large record""" + # Create metadata with many columns + columns = [ColumnMetadata(f'col_{i}', DataType.INTEGER) for i in range(100)] + metadata = AssetMetadata(table_name='large_table', columns=columns) + + # Create large record + record = list(range(100)) + + context = CELContextBuilder.build_context( + value=50, + column_name='col_50', + record=record, + metadata=metadata + ) + + assert context['value'] == 50 + record_dict = context['record'] + if hasattr(record_dict, '__getitem__'): + assert record_dict['col_0'] == 0 + assert record_dict['col_50'] == 50 + assert record_dict['col_99'] == 99 + + def test_build_context_special_characters_in_names(self): + """Test building context with special characters in column names""" + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('col_with_underscore', DataType.INTEGER), + ColumnMetadata('col123', DataType.INTEGER) + ] + ) + + record = [100, 200] + context = CELContextBuilder.build_context( + value=100, + column_name='col_with_underscore', + record=record, + metadata=metadata + ) + + record_dict = context['record'] + if hasattr(record_dict, '__getitem__'): + assert record_dict['col_with_underscore'] == 100 + assert record_dict['col123'] == 200 + + def test_build_context_unicode_values(self): + """Test building context with unicode values""" + metadata = AssetMetadata( + table_name='test', + columns=[ + ColumnMetadata('id', DataType.INTEGER), + ColumnMetadata('name', DataType.STRING) + ] + ) + + record = [1, '日本語'] # Japanese characters + context = CELContextBuilder.build_context( + value='日本語', + column_name='name', + record=record, + metadata=metadata 
+ ) + + assert context['value'] == '日本語' + record_dict = context['record'] + if hasattr(record_dict, '__getitem__'): + assert record_dict['name'] == '日本語' + +# Made with Bob diff --git a/tests/src/dq_validator/test_table_cel_check.py b/tests/src/dq_validator/test_table_cel_check.py new file mode 100644 index 0000000..31b90ab --- /dev/null +++ b/tests/src/dq_validator/test_table_cel_check.py @@ -0,0 +1,462 @@ +""" +Tests for table-level CEL validation. +""" + +import pytest +from wxdi.dq_validator import ( + AssetMetadata, ColumnMetadata, DataType, + Validator, TableValidationRule, TableCELCheck, + CELCompilationError, CELEvaluationError +) + + +@pytest.fixture +def metadata(): + """Create test metadata""" + return AssetMetadata( + table_name='test_table', + columns=[ + ColumnMetadata('emp_id', DataType.INTEGER), + ColumnMetadata('name', DataType.STRING), + ColumnMetadata('age', DataType.INTEGER), + ColumnMetadata('salary', DataType.DECIMAL), + ColumnMetadata('min_salary', DataType.DECIMAL), + ColumnMetadata('department', DataType.STRING), + ] + ) + + +class TestTableCELCheckInitialization: + """Test TableCELCheck initialization""" + + def test_valid_expression(self): + """Test initialization with valid expression""" + check = TableCELCheck('salary > min_salary') + assert check.expression == 'salary > min_salary' + assert check.get_check_name() == 'table_cel_check' + + def test_invalid_expression(self): + """Test initialization with invalid expression""" + with pytest.raises(CELCompilationError): + TableCELCheck('invalid syntax !') + + def test_custom_error_message(self): + """Test custom error message""" + check = TableCELCheck('age >= 18', error_message='Must be adult') + assert check.error_message == 'Must be adult' + + +class TestTableCELCheckValidation: + """Test TableCELCheck validation""" + + def test_simple_comparison_pass(self, metadata): + """Test simple comparison that passes""" + validator = Validator(metadata) + validator.add_table_rule( + 
TableValidationRule('salary_check') + .add_check(TableCELCheck('salary > min_salary')) + ) + + record = [1001, 'John', 30, 75000.00, 60000.00, 'Engineering'] + result = validator.validate(record) + + assert result.is_valid + assert len(result.errors) == 0 + + def test_simple_comparison_fail(self, metadata): + """Test simple comparison that fails""" + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('salary_check') + .add_check(TableCELCheck('salary > min_salary')) + ) + + record = [1001, 'John', 30, 50000.00, 60000.00, 'Engineering'] + result = validator.validate(record) + + assert not result.is_valid + assert len(result.errors) == 1 + assert result.errors[0].column_name == 'salary_check' + + def test_conditional_logic_pass(self, metadata): + """Test conditional logic that passes""" + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('age_salary_check') + .add_check(TableCELCheck( + 'age > 40 ? salary >= 80000 : salary >= 50000' + )) + ) + + # Young employee with adequate salary + record1 = [1001, 'John', 30, 55000.00, 50000.00, 'Engineering'] + result1 = validator.validate(record1) + assert result1.is_valid + + # Senior employee with adequate salary + record2 = [1002, 'Jane', 45, 85000.00, 70000.00, 'Sales'] + result2 = validator.validate(record2) + assert result2.is_valid + + def test_conditional_logic_fail(self, metadata): + """Test conditional logic that fails""" + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('age_salary_check') + .add_check(TableCELCheck( + 'age > 40 ? 
salary >= 80000 : salary >= 50000' + )) + ) + + # Senior employee with inadequate salary + record = [1001, 'John', 45, 70000.00, 60000.00, 'Engineering'] + result = validator.validate(record) + + assert not result.is_valid + assert len(result.errors) == 1 + + def test_multiple_conditions_pass(self, metadata): + """Test multiple conditions that pass""" + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('multi_check') + .add_check(TableCELCheck( + 'salary > min_salary && age >= 18 && age <= 65' + )) + ) + + record = [1001, 'John', 30, 75000.00, 60000.00, 'Engineering'] + result = validator.validate(record) + + assert result.is_valid + + def test_multiple_conditions_fail(self, metadata): + """Test multiple conditions that fail""" + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('multi_check') + .add_check(TableCELCheck( + 'salary > min_salary && age >= 18 && age <= 65' + )) + ) + + # Age too young + record = [1001, 'John', 16, 75000.00, 60000.00, 'Engineering'] + result = validator.validate(record) + + assert not result.is_valid + + def test_string_operations(self, metadata): + """Test string operations in CEL""" + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('dept_check') + .add_check(TableCELCheck( + 'department in ["Engineering", "Sales", "HR"]' + )) + ) + + # Valid department + record1 = [1001, 'John', 30, 75000.00, 60000.00, 'Engineering'] + result1 = validator.validate(record1) + assert result1.is_valid + + # Invalid department + record2 = [1002, 'Jane', 30, 75000.00, 60000.00, 'Marketing'] + result2 = validator.validate(record2) + assert not result2.is_valid + + def test_arithmetic_operations(self, metadata): + """Test arithmetic operations in CEL""" + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('salary_calc') + .add_check(TableCELCheck( + 'salary >= min_salary * 1.2' + )) + ) + + # Salary is 1.25x minimum (passes) + 
record1 = [1001, 'John', 30, 75000.00, 60000.00, 'Engineering'] + result1 = validator.validate(record1) + assert result1.is_valid + + # Salary is only 1.1x minimum (fails) + record2 = [1002, 'Jane', 30, 66000.00, 60000.00, 'Sales'] + result2 = validator.validate(record2) + assert not result2.is_valid + + +class TestTableCELCheckMultipleRules: + """Test multiple table-level rules""" + + def test_multiple_table_rules(self, metadata): + """Test validator with multiple table rules""" + validator = Validator(metadata) + + validator.add_table_rule( + TableValidationRule('salary_check') + .add_check(TableCELCheck('salary > min_salary')) + ) + + validator.add_table_rule( + TableValidationRule('age_check') + .add_check(TableCELCheck('age >= 18 && age <= 65')) + ) + + # All rules pass + record1 = [1001, 'John', 30, 75000.00, 60000.00, 'Engineering'] + result1 = validator.validate(record1) + assert result1.is_valid + assert result1.total_checks == 2 + assert result1.passed_checks == 2 + + # One rule fails + record2 = [1002, 'Jane', 16, 75000.00, 60000.00, 'Sales'] + result2 = validator.validate(record2) + assert not result2.is_valid + assert result2.total_checks == 2 + assert result2.passed_checks == 1 + assert result2.failed_checks == 1 + + +class TestTableCELCheckColumnValidation: + """Test column reference validation""" + + def test_validate_column_references_valid(self, metadata): + """Test validation with valid column references""" + check = TableCELCheck('salary > min_salary && age >= 18') + + # Should not raise error + check.validate_column_references([c.name for c in metadata.columns]) + + def test_validate_column_references_invalid(self, metadata): + """Test validation with invalid column references""" + check = TableCELCheck('salary > max_salary') # max_salary doesn't exist + + # If column extraction works, should raise ValueError + # If extraction returns None (fallback), validation is skipped + if check._required_columns is not None: + with 
pytest.raises(ValueError) as exc_info: + check.validate_column_references([c.name for c in metadata.columns]) + + assert 'max_salary' in str(exc_info.value) + assert 'CASE-SENSITIVE' in str(exc_info.value) + else: + # Extraction failed - validation is skipped (safe fallback) + # This is acceptable behavior + check.validate_column_references([c.name for c in metadata.columns]) + + +class TestTableCELCheckPerformance: + """Test performance optimization""" + + def test_required_columns_extraction(self): + """Test that required columns are extracted from expression""" + check = TableCELCheck('salary > min_salary && age >= 18') + + # Column extraction is best-effort optimization + # If it works, verify the columns + # If it returns None, that's acceptable (uses all columns as fallback) + if check._required_columns is not None: + # Extraction succeeded - verify columns + assert 'salary' in check._required_columns + assert 'min_salary' in check._required_columns + assert 'age' in check._required_columns + else: + # Extraction returned None - acceptable fallback behavior + # All columns will be used in context (safe but less optimal) + pass + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) + + +class TestTableCELCheckBindings: + """Test variable bindings for table-level CEL checks.""" + + def test_table_basic_binding(self, metadata): + """Test basic variable binding with table-level check.""" + # Create check with binding: 'current_sal' -> 'salary' + check = TableCELCheck( + expression='current_sal > 50000', + bindings={'current_sal': 'salary'} + ) + + # Create validator with table rule + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('salary_check') + .add_check(check) + ) + + # Test with passing value + record_pass = [1001, 'John', 30, 60000, 50000, 'Engineering'] + result = validator.validate(record_pass) + assert result.is_valid + + # Test with failing value + record_fail = [1002, 'Jane', 30, 40000, 50000, 'Engineering'] 
+ result = validator.validate(record_fail) + assert not result.is_valid + + def test_table_multiple_bindings(self, metadata): + """Test multiple variable bindings in table-level check.""" + # Create check with multiple bindings + check = TableCELCheck( + expression='current_sal > minimum && person_age >= 18', + bindings={ + 'current_sal': 'salary', + 'minimum': 'min_salary', + 'person_age': 'age' + } + ) + + # Create validator + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('multi_check') + .add_check(check) + ) + + # Test with passing values + record_pass = [1001, 'John', 25, 60000, 50000, 'Engineering'] + result = validator.validate(record_pass) + assert result.is_valid + + # Test with failing values (below minimum) + record_fail1 = [1002, 'Jane', 25, 40000, 50000, 'Engineering'] + result = validator.validate(record_fail1) + assert not result.is_valid + + # Test with failing values (too young) + record_fail2 = [1003, 'Bob', 16, 60000, 50000, 'Engineering'] + result = validator.validate(record_fail2) + assert not result.is_valid + + def test_table_empty_bindings(self, metadata): + """Test that empty bindings dict works for table checks.""" + check = TableCELCheck( + expression='salary > 50000', + bindings={} + ) + + validator = Validator(metadata) + validator.add_table_rule( + TableValidationRule('salary_check') + .add_check(check) + ) + + record = [1001, 'John', 30, 60000, 50000, 'Engineering'] + result = validator.validate(record) + assert result.is_valid + + def test_table_invalid_bindings_type(self): + """Test that invalid bindings type raises error for table checks.""" + with pytest.raises(ValueError, match="bindings must be a dictionary"): + TableCELCheck( + expression='total > 100000', + bindings='invalid' # Should be dict + ) + + + +class TestTableCELCheckHelperMethods: + """Tests for TableCELCheck helper methods to improve coverage""" + + def test_validate_bindings_with_empty_string_key(self): + """Test that empty 
string keys in bindings raise error""" + with pytest.raises(ValueError, match="binding keys and values cannot be empty"): + TableCELCheck( + expression='current > 100', + bindings={'': 'salary'} # Empty key + ) + + def test_validate_bindings_with_empty_string_value(self): + """Test that empty string values in bindings raise error""" + with pytest.raises(ValueError, match="binding keys and values cannot be empty"): + TableCELCheck( + expression='current > 100', + bindings={'current': ''} # Empty value + ) + + def test_validate_bindings_with_non_string_key(self): + """Test that non-string keys in bindings raise error""" + with pytest.raises(ValueError, match="binding keys and values must be strings"): + TableCELCheck( + expression='current > 100', + bindings={123: 'salary'} # Integer key + ) + + def test_validate_bindings_with_non_string_value(self): + """Test that non-string values in bindings raise error""" + with pytest.raises(ValueError, match="binding keys and values must be strings"): + TableCELCheck( + expression='current > 100', + bindings={'current': 456} # Integer value + ) + + def test_extract_column_references_with_bindings(self): + """Test column extraction when bindings are used""" + check = TableCELCheck( + expression='current_sal > minimum', + bindings={'current_sal': 'salary', 'minimum': 'min_salary'} + ) + # Should extract the actual column names from bindings + if check._required_columns: + assert 'salary' in check._required_columns or 'min_salary' in check._required_columns + + def test_validation_with_cel_evaluation_error(self, metadata): + """Test handling of CEL evaluation errors in table checks""" + check = TableCELCheck('salary.nonexistent_method()') + # Provide proper record format for table CEL check + record = [1001, 'John', 30, 75000.00, 60000.00, 'Engineering'] + context = { + 'record': record, + 'metadata': metadata, + 'record_index': 0 + } + # Table CEL checks raise CELEvaluationError for evaluation failures + with 
pytest.raises(CELEvaluationError, match="CEL evaluation failed"): + check.validate(None, context) + + +class TestTableCELCheckEdgeCases: + """Tests for edge cases in table CEL checks""" + + def test_validate_with_none_record(self, metadata): + """Test validation when record is None""" + check = TableCELCheck('salary > 50000') + context = { + 'record': None, + 'metadata': metadata + } + # Should raise ValueError for missing record + with pytest.raises(ValueError, match="requires 'record' and 'metadata'"): + check.validate(None, context) + + def test_validate_with_missing_record_key(self, metadata): + """Test validation when 'record' key is missing from context""" + check = TableCELCheck('salary > 50000') + context = { + 'metadata': metadata + } + # Should raise ValueError for missing record + with pytest.raises(ValueError, match="requires 'record' and 'metadata'"): + check.validate(None, context) + + def test_validate_with_missing_metadata(self): + """Test validation when metadata is missing""" + check = TableCELCheck('salary > 50000') + record = [1001, 'John', 30, 75000.00, 60000.00, 'Engineering'] + context = { + 'record': record + } + # Should raise ValueError for missing metadata + with pytest.raises(ValueError, match="requires 'record' and 'metadata'"): + check.validate(None, context) + + +# Made with Bob +# Made with Bob