Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion dataframe.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,17 @@ library
DataFrame.Lazy.IO.CSV,
DataFrame.Lazy.Internal.DataFrame,
DataFrame.Monad,
DataFrame.DecisionTree
DataFrame.DecisionTree,
DataFrame.Typed.Types,
DataFrame.Typed.Schema,
DataFrame.Typed.Freeze,
DataFrame.Typed.Access,
DataFrame.Typed.Operations,
DataFrame.Typed.Join,
DataFrame.Typed.Aggregate,
DataFrame.Typed.TH,
DataFrame.Typed.Expr,
DataFrame.Typed
build-depends: base >= 4 && <5,
aeson >= 0.11.0.0 && < 3,
array >= 0.5.4.0 && < 0.6,
Expand Down
111 changes: 34 additions & 77 deletions src/DataFrame/Internal/Column.hs
Original file line number Diff line number Diff line change
Expand Up @@ -620,83 +620,40 @@ zipColumns (OptionalColumn optcolumn) (OptionalColumn optother) = BoxedColumn (V

-- | Merge two columns row by row using 'These'.
--
-- The result has as many rows as the shorter of the two inputs.
--
-- * A row where both sides hold a value becomes @These l r@.
-- * A row where only the left side holds a value becomes @This l@.
-- * A row where only the right side holds a value becomes @That r@.
--
-- Only an 'OptionalColumn' can have a missing value. When both inputs are
-- optional the result is an 'OptionalColumn' whose rows are 'Nothing' where
-- both sides are missing; every other combination yields a 'BoxedColumn'.
mergeColumns :: Column -> Column -> Column
mergeColumns (BoxedColumn column) (BoxedColumn other) = BoxedColumn (VG.zipWith These column other)
mergeColumns (BoxedColumn column) (UnboxedColumn other) =
    BoxedColumn
        ( VB.generate
            (min (VG.length column) (VG.length other))
            (\i -> These (column VG.! i) (other VG.! i))
        )
mergeColumns (BoxedColumn column) (OptionalColumn other) =
    BoxedColumn
        ( VB.generate
            (min (VG.length column) (VG.length other))
            -- Pattern match instead of isNothing/fromJust: total, and each
            -- element is indexed only once.
            ( \i -> case other VG.! i of
                Nothing -> This (column VG.! i)
                Just y -> These (column VG.! i) y
            )
        )
mergeColumns (UnboxedColumn column) (BoxedColumn other) =
    BoxedColumn
        ( VB.generate
            (min (VG.length column) (VG.length other))
            (\i -> These (column VG.! i) (other VG.! i))
        )
mergeColumns (UnboxedColumn column) (UnboxedColumn other) =
    BoxedColumn
        ( VB.generate
            (min (VG.length column) (VG.length other))
            (\i -> These (column VG.! i) (other VG.! i))
        )
mergeColumns (UnboxedColumn column) (OptionalColumn other) =
    BoxedColumn
        ( VB.generate
            (min (VG.length column) (VG.length other))
            ( \i -> case other VG.! i of
                Nothing -> This (column VG.! i)
                Just y -> These (column VG.! i) y
            )
        )
mergeColumns (OptionalColumn column) (BoxedColumn other) =
    BoxedColumn
        ( VB.generate
            (min (VG.length column) (VG.length other))
            ( \i -> case column VG.! i of
                Nothing -> That (other VG.! i)
                Just x -> These x (other VG.! i)
            )
        )
mergeColumns (OptionalColumn column) (UnboxedColumn other) =
    BoxedColumn
        ( VB.generate
            (min (VG.length column) (VG.length other))
            ( \i -> case column VG.! i of
                Nothing -> That (other VG.! i)
                Just x -> These x (other VG.! i)
            )
        )
mergeColumns (OptionalColumn column) (OptionalColumn other) =
    OptionalColumn
        ( VB.generate
            (min (VG.length column) (VG.length other))
            -- All four presence combinations are spelled out, so no partial
            -- function is needed to unwrap either side.
            ( \i -> case (column VG.! i, other VG.! i) of
                (Nothing, Nothing) -> Nothing
                (Nothing, Just y) -> Just (That y)
                (Just x, Nothing) -> Just (This x)
                (Just x, Just y) -> Just (These x y)
            )
        )
-- Dispatch on the constructor pair. Boxed and unboxed columns always hold a
-- value, so they share the "required" helpers; only optional columns can
-- contribute a missing side.
mergeColumns colA colB = case (colA, colB) of
    -- Both sides required: every row carries both values.
    (BoxedColumn xs, BoxedColumn ys) -> bothPresent xs ys
    (BoxedColumn xs, UnboxedColumn ys) -> bothPresent xs ys
    (UnboxedColumn xs, BoxedColumn ys) -> bothPresent xs ys
    (UnboxedColumn xs, UnboxedColumn ys) -> bothPresent xs ys
    -- Right side optional: a missing right value leaves only the left one.
    (BoxedColumn xs, OptionalColumn ys) -> rightOptional xs ys
    (UnboxedColumn xs, OptionalColumn ys) -> rightOptional xs ys
    -- Left side optional: a missing left value leaves only the right one.
    (OptionalColumn xs, BoxedColumn ys) -> leftOptional xs ys
    (OptionalColumn xs, UnboxedColumn ys) -> leftOptional xs ys
    -- Both optional: the merged column stays optional.
    (OptionalColumn xs, OptionalColumn ys) ->
        OptionalColumn (pairwise xs ys bothOptional)
  where
    -- Build a boxed vector, as long as the shorter input, by combining the
    -- elements found at the same index.
    pairwise xs ys combine =
        VB.generate
            (min (VG.length xs) (VG.length ys))
            (\i -> combine (xs VG.! i) (ys VG.! i))
    {-# INLINE pairwise #-}

    bothPresent xs ys = BoxedColumn (pairwise xs ys These)

    rightOptional xs ys =
        BoxedColumn (pairwise xs ys (\x my -> maybe (This x) (These x) my))

    leftOptional xs ys =
        BoxedColumn (pairwise xs ys (\mx y -> maybe (That y) (`These` y) mx))

    -- Row combiner used when either side may be missing.
    bothOptional Nothing Nothing = Nothing
    bothOptional (Just x) Nothing = Just (This x)
    bothOptional Nothing (Just y) = Just (That y)
    bothOptional (Just x) (Just y) = Just (These x y)
{-# INLINE mergeColumns #-}

-- | An internal, column version of zipWith.
Expand Down
221 changes: 221 additions & 0 deletions src/DataFrame/Typed.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
{-# LANGUAGE DataKinds #-}

{- |
Module : DataFrame.Typed
Copyright : (c) 2025
License : MIT
Maintainer : mschavinda@gmail.com
Stability : experimental

A type-safe layer over the @dataframe@ library.

This module provides 'TypedDataFrame', a phantom-typed wrapper around
the untyped 'DataFrame' that tracks column names and types at compile time.
All operations delegate to the untyped core at runtime; the phantom type
is updated at compile time to reflect schema changes.

== Key difference from untyped API: TExpr

All expression-taking operations use 'TExpr' (typed expressions) instead
of raw @Expr@. Column references are validated at compile time:

@
{\-\# LANGUAGE DataKinds, TypeApplications, TypeOperators \#-\}
import Data.Text (Text)
import qualified DataFrame as D
import qualified DataFrame.Typed as T

type People = '[T.Column \"name\" Text, T.Column \"age\" Int]

main = do
raw <- D.readCsv \"people.csv\"
case T.freeze \@People raw of
Nothing -> putStrLn \"Schema mismatch!\"
Just df -> do
let adults = T.filterWhere (T.col \@\"age\" T..>=. T.lit 18) df
let names = T.columnAsList \@\"name\" adults -- :: [Text]
print names
@

Column references like @T.col \@\"age\"@ are checked at compile time — if the
column doesn't exist or has the wrong type, you get a type error, not a
runtime exception.

== filterAllJust tracks Maybe-stripping

@
df :: TypedDataFrame '[Column \"x\" (Maybe Double), Column \"y\" Int]
T.filterAllJust df :: TypedDataFrame '[Column \"x\" Double, Column \"y\" Int]
@

== Typed aggregation (Option B)

@
result = T.aggregate
(T.agg \@\"total\" (T.sum (T.col \@\"salary\"))
$ T.agg \@\"count\" (T.count (T.col \@\"salary\"))
$ T.aggNil)
(T.groupBy \@'[\"dept\"] employees)
@
-}
module DataFrame.Typed (
-- * Core types
TypedDataFrame,
Column,
TypedGrouped,
These (..),

-- * Typed expressions
TExpr (..),
col,
lit,
ifThenElse,
lift,
lift2,

-- * Comparison operators
(.==.),
(./=.),
(.<.),
(.<=.),
(.>=.),
(.>.),

-- * Logical operators
(.&&.),
(.||.),
DataFrame.Typed.Expr.not,

-- * Aggregation expression combinators
DataFrame.Typed.Expr.sum,
mean,
count,
DataFrame.Typed.Expr.minimum,
DataFrame.Typed.Expr.maximum,
collect,

-- * Typed sort orders
TSortOrder (..),
asc,
desc,

-- * Named expression helper
DataFrame.Typed.Expr.as,

-- * Freeze / thaw boundary
freeze,
freezeWithError,
thaw,
unsafeFreeze,

-- * Typed column access
columnAsVector,
columnAsList,

-- * Schema-preserving operations
filterWhere,
filter,
filterBy,
filterAllJust,
filterJust,
filterNothing,
sortBy,
take,
takeLast,
drop,
dropLast,
range,
cube,
distinct,
sample,
shuffle,

-- * Schema-modifying operations
derive,
impute,
select,
exclude,
rename,
renameMany,
insert,
insertColumn,
insertVector,
cloneColumn,
dropColumn,
replaceColumn,

-- * Metadata
dimensions,
nRows,
nColumns,
columnNames,

-- * Vertical merge
append,

-- * Joins
innerJoin,
leftJoin,
rightJoin,
fullOuterJoin,

-- * GroupBy and Aggregation (Option B)
groupBy,
agg,
aggNil,
aggregate,
aggregateUntyped,

-- * Template Haskell
deriveSchema,
deriveSchemaFromCsvFile,

-- * Schema type families (for advanced use)
Lookup,
HasName,
SubsetSchema,
ExcludeSchema,
RenameInSchema,
RemoveColumn,
Impute,
Append,
Reverse,
StripAllMaybe,
StripMaybeAt,
GroupKeyColumns,
InnerJoinSchema,
LeftJoinSchema,
RightJoinSchema,
FullOuterJoinSchema,
AssertAbsent,
AssertPresent,

-- * Constraints
KnownSchema (..),
AllKnownSymbol (..),

-- * Pipe operator
(|>),
) where

import Prelude hiding (drop, filter, take)

import DataFrame.Typed.Access (columnAsList, columnAsVector)
import DataFrame.Typed.Aggregate (
agg,
aggNil,
aggregate,
aggregateUntyped,
groupBy,
)
import DataFrame.Typed.Expr
import DataFrame.Typed.Freeze (freeze, freezeWithError, thaw, unsafeFreeze)
import DataFrame.Typed.Join (fullOuterJoin, innerJoin, leftJoin, rightJoin)
import DataFrame.Typed.Operations
import DataFrame.Typed.Schema
import DataFrame.Typed.TH (deriveSchema, deriveSchemaFromCsvFile)
import DataFrame.Typed.Types (
Column,
TSortOrder (..),
These (..),
TypedDataFrame,
TypedGrouped,
)
Loading