Skip to content

Commit 94fe93b

Browse files
authored
feat: narwhals-compliant dataframe setup (#112)
Adds minimal setup for narwhals-complaint dataframe (lazyframe).
1 parent bddb47b commit 94fe93b

File tree

7 files changed

+273
-73
lines changed

7 files changed

+273
-73
lines changed

examples/dataframe_example.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from substrait.builders.plan import read_named_table
2+
from substrait.builders.type import i64, boolean, struct, named_struct
3+
from substrait.extension_registry import ExtensionRegistry
4+
import substrait.dataframe as sdf
5+
6+
registry = ExtensionRegistry(load_default_extensions=True)
7+
8+
ns = named_struct(
9+
names=["id", "is_applicable"],
10+
struct=struct(types=[i64(nullable=False), boolean()], nullable=False),
11+
)
12+
13+
table = read_named_table("example_table", ns)
14+
15+
frame = sdf.DataFrame(read_named_table("example_table", ns))
16+
frame = frame.select(sdf.col("id"))
17+
print(frame.to_substrait(registry))

examples/narwhals_example.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Install duckdb and pyarrow before running this example
2+
# /// script
3+
# dependencies = [
4+
# "narwhals==2.9.0",
5+
# "substrait[extensions] @ file:///${PROJECT_ROOT}/"
6+
# ]
7+
# ///
8+
9+
from substrait.builders.plan import read_named_table
10+
from substrait.builders.type import i64, boolean, struct, named_struct
11+
from substrait.extension_registry import ExtensionRegistry
12+
13+
from narwhals.typing import FrameT
14+
import narwhals as nw
15+
import substrait.dataframe as sdf
16+
17+
18+
registry = ExtensionRegistry(load_default_extensions=True)
19+
20+
ns = named_struct(
21+
names=["id", "is_applicable"],
22+
struct=struct(types=[i64(nullable=False), boolean()], nullable=False),
23+
)
24+
25+
table = read_named_table("example_table", ns)
26+
27+
28+
lazy_frame: FrameT = nw.from_native(
29+
sdf.DataFrame(read_named_table("example_table", ns))
30+
)
31+
32+
lazy_frame = lazy_frame.select(nw.col("id").abs(), new_id=nw.col("id"))
33+
34+
df: sdf.DataFrame = lazy_frame.to_native()
35+
36+
print(df.to_substrait(registry))
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import substrait.dataframe
2+
from substrait.builders.extended_expression import column
3+
4+
from substrait.dataframe.dataframe import DataFrame
5+
from substrait.dataframe.expression import Expression
6+
7+
__all__ = [DataFrame, Expression]
8+
9+
10+
def col(name: str) -> Expression:
11+
"""Column selection."""
12+
return Expression(column(name))
13+
14+
# TODO handle str_as_lit argument
15+
def parse_into_expr(expr, str_as_lit: bool):
16+
return expr._to_compliant_expr(substrait.dataframe)
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from typing import Union, Iterable
2+
import substrait.dataframe
3+
from substrait.builders.plan import select
4+
from substrait.dataframe.expression import Expression
5+
6+
7+
class DataFrame:
8+
def __init__(self, plan):
9+
self.plan = plan
10+
self._native_frame = self
11+
12+
def to_substrait(self, registry):
13+
return self.plan(registry)
14+
15+
def __narwhals_lazyframe__(self) -> "DataFrame":
16+
"""Return object implementing CompliantDataFrame protocol."""
17+
return self
18+
19+
def __narwhals_namespace__(self):
20+
"""
21+
Return the namespace object that contains functions like col, lit, etc.
22+
This is how Narwhals knows which backend's functions to use.
23+
"""
24+
return substrait.dataframe
25+
26+
def select(
27+
self, *exprs: Union[Expression, Iterable[Expression]], **named_exprs: Expression
28+
) -> "DataFrame":
29+
expressions = [e.expr for e in exprs] + [
30+
expr.alias(alias).expr for alias, expr in named_exprs.items()
31+
]
32+
return DataFrame(select(self.plan, expressions=expressions))
33+
34+
# TODO handle version
35+
def _with_version(self, version):
36+
return self
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from substrait.builders.extended_expression import (
2+
UnboundExtendedExpression,
3+
ExtendedExpressionOrUnbound,
4+
resolve_expression,
5+
scalar_function
6+
)
7+
import substrait.gen.proto.type_pb2 as stp
8+
import substrait.gen.proto.extended_expression_pb2 as stee
9+
from substrait.extension_registry import ExtensionRegistry
10+
11+
12+
def _alias(
13+
expr: ExtendedExpressionOrUnbound,
14+
alias: str = None,
15+
):
16+
def resolve(
17+
base_schema: stp.NamedStruct, registry: ExtensionRegistry
18+
) -> stee.ExtendedExpression:
19+
bound_expression = resolve_expression(expr, base_schema, registry)
20+
bound_expression.referred_expr[0].output_names[0] = alias
21+
return bound_expression
22+
23+
return resolve
24+
25+
26+
class Expression:
27+
def __init__(self, expr: UnboundExtendedExpression):
28+
self.expr = expr
29+
30+
def alias(self, alias: str):
31+
self.expr = _alias(self.expr, alias)
32+
return self
33+
34+
def abs(self):
35+
self.expr = scalar_function("functions_arithmetic.yaml", "abs", expressions=[self.expr])
36+
return self

0 commit comments

Comments
 (0)