diff --git a/python-pointblank/README.md b/python-pointblank/README.md
new file mode 100644
index 0000000000..2fae88f76f
--- /dev/null
+++ b/python-pointblank/README.md
@@ -0,0 +1,24 @@
+# Validating Data With Pointblank in Python
+
+Supporting code and sample data for the Real Python tutorial
+"Validating Data With Pointblank in Python".
+
+## Requirements
+
+The Python scripts use PEP 723 dependency metadata and run with
+[uv](https://docs.astral.sh/uv/):
+
+```console
+$ uv run pointblank_quickstart.py
+$ uv run pointblank_thresholds.py
+$ uv run pointblank_atoms.py
+```
+
+The command-line examples can run without a project environment:
+
+```console
+$ uv run --no-project --with 'pointblank[pl]' -- pb scan pointblank_atoms.csv
+$ uv run --no-project --with 'pointblank[pl]' -- pb missing pointblank_atoms.csv
+$ uvx --from 'pointblank[pl]' pb run pointblank_atoms.yaml --output-html pointblank_report.html
+```
+
diff --git a/python-pointblank/pointblank_atoms.csv b/python-pointblank/pointblank_atoms.csv
new file mode 100644
index 0000000000..f558fa9e69
--- /dev/null
+++ b/python-pointblank/pointblank_atoms.csv
@@ -0,0 +1,14 @@
+atom_id,symbol,x,y,z,fx,fy,fz
+0,Cu,1.0,0.5,0.1,0.1,0.0,0.0
+1,Pt,2.1,1.5,0.2,-0.2,0.1,-0.1
+2,Cu,3.2,2.5,0.3,0.3,-0.1,0.1
+3,Pt,4.3,3.5,0.4,-0.1,0.0,0.0
+4,Cu,5.4,4.5,0.5,0.2,0.1,-0.1
+5,Pt,6.5,5.5,0.6,-0.3,-0.1,0.1
+6,Cu,7.6,6.5,0.7,0.1,0.0,0.0
+7,Pt,8.7,7.5,0.8,-0.2,0.1,-0.1
+8,Cu,9.8,8.5,0.9,0.3,-0.1,0.1
+9,Pt,10.9,9.5,1.0,-0.1,0.0,0.0
+10,Zz,0.5,0.5,0.1,0.0,0.0,0.0
+11,Cu,,1.5,0.2,0.0,0.0,0.0
+12,Pt,12.1,2.5,0.3,1500.0,0.0,0.0
diff --git a/python-pointblank/pointblank_atoms.py b/python-pointblank/pointblank_atoms.py
new file mode 100644
index 0000000000..a1bec97dba
--- /dev/null
+++ b/python-pointblank/pointblank_atoms.py
@@ -0,0 +1,43 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pointblank[pl]",
+# ]
+# ///
+
+import polars as pl
+import pointblank as pb
+
+VALID_ELEMENTS = ["Cu", "Pt"]  # Symbols accepted in the "symbol" column.
+
+
+def main() -> None:  # Validate the atoms CSV, then print pass/fail rows.
+    atoms = pl.read_csv("pointblank_atoms.csv")  # Relative to the CWD.
+    # Build the validation plan, then run it with interrogate().
+    validation = (
+        pb.Validate(
+            data=atoms,
+            tbl_name="atoms_from_parser",
+            label="Round-trip validation before re-export",
+            thresholds=pb.Thresholds(warning=0.02, error=0.05, critical=0.07),
+        )
+        .col_vals_in_set(columns="symbol", set=VALID_ELEMENTS)
+        .col_vals_not_null(columns=["x", "y", "z"])
+        .col_vals_between(columns=["x", "y", "z"], left=0, right=20)
+        .col_vals_between(columns="fx", left=-1000, right=1000)
+        .interrogate()
+    )
+    # Request the "pass" and "fail" subsets of the data from the validation.
+    clean = validation.get_sundered_data(type="pass")
+    dirty = validation.get_sundered_data(type="fail")
+    # Print a row count for each subset, then four columns from each.
+    print(f"Safe to re-export: {len(clean)} rows")
+    print(f"Needs review: {len(dirty)} rows")
+    print("\nClean rows")
+    print(clean.select(["atom_id", "symbol", "x", "fx"]))
+    print("\nDirty rows")
+    print(dirty.select(["atom_id", "symbol", "x", "fx"]))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python-pointblank/pointblank_atoms.yaml b/python-pointblank/pointblank_atoms.yaml
new file mode 100644
index 0000000000..edeac47fa8
--- /dev/null
+++ b/python-pointblank/pointblank_atoms.yaml
@@ -0,0 +1,22 @@
+tbl: pointblank_atoms.csv
+df_library: polars
+tbl_name: "Atom Validation"
+label: "Tutorial YAML validation"
+thresholds:
+  warning: 0.02
+  error: 0.05
+  critical: 0.07
+steps:
+  - col_vals_in_set:
+      columns: symbol
+      set: [Cu, Pt]
+  - col_vals_not_null:
+      columns: [x, y, z]
+  - col_vals_between:
+      columns: [x, y, z]
+      left: 0
+      right: 20
+  - col_vals_between:
+      columns: fx
+      left: -1000
+      right: 1000
diff --git a/python-pointblank/pointblank_quickstart.py b/python-pointblank/pointblank_quickstart.py
new file mode 100644
index 0000000000..e51b4875eb
--- /dev/null
+++ b/python-pointblank/pointblank_quickstart.py
@@ -0,0 +1,39 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pointblank[pl]",
+# ]
+# ///
+
+import pointblank as pb
+
+
+def main() -> None:  # Run three checks on small_table and print a summary.
+    validation = (
+        pb.Validate(
+            data=pb.load_dataset("small_table", tbl_type="polars"),
+            tbl_name="small_table",
+            label="Quickstart validation",
+        )
+        .col_vals_between(columns="d", left=0, right=5000)
+        .col_vals_in_set(columns="f", set=["low", "mid", "high"])
+        .col_vals_not_null(columns="c")
+        .interrogate()
+    )
+    # Keep only the report columns that the loop below prints.
+    report = validation.get_dataframe_report()
+    summary = report.select(
+        ["step_description", "pass_n", "failed_n"]
+    ).iter_rows(named=True)
+
+    print("Validation summary:\n")
+    for step in summary:
+        print(
+            f"{step['step_description']:20}"  # Minimum field width of 20.
+            f"passed={step['pass_n']:<4}"  # Left-align in a 4-wide field.
+            f"failed={step['failed_n']}"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python-pointblank/pointblank_report.html b/python-pointblank/pointblank_report.html
new file mode 100644
index 0000000000..6dae8b9841
--- /dev/null
+++ b/python-pointblank/pointblank_report.html
@@ -0,0 +1,443 @@
+