From 7fb5adfb53053f195ddf83a2dfd8166b91906164 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sat, 27 May 2017 21:09:26 +0100 Subject: [PATCH 1/8] Update has_dtypes to allow functions as dict values of parameter items A change to allow has_dtypes parameter ``items`` to have functions as dict values. A few discussion points: I check for ``FunctionType`` not whether the input is callable. This is because various types are themselves callables (``int``, ``float`` etc.). Secondly, the check function must return ``True`` to pass. A truthy value is not enough. I'm thinking that users will sometimes return the series or dtype, and explicitly requiring ``True`` will minimize some errors (or so my thinking goes). Thirdly, the check function checks the *series*, not the series' dtype. My thinking on this is that by checking the series rather than the dtype only, the function can also check orderedness and categories on a categorical, and these checks seem logical to place in ``has_dtypes``. I can also see the counter argument, that we're not strictly checking ``dtype``, but ``pd.api.types`` accepts both series and dtypes, so no great loss should come from using series. I can write some tests and update docs, if this proposal is accepted. See #36 for the issue. --- engarde/checks.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/engarde/checks.py b/engarde/checks.py index 8819d3c..ae3ba23 100644 --- a/engarde/checks.py +++ b/engarde/checks.py @@ -8,6 +8,8 @@ - Makes its assert on the result - Return the original DataFrame """ +from types import FunctionType + import numpy as np import pandas as pd import pandas.util.testing as tm @@ -197,21 +199,31 @@ def within_n_std(df, n=3): def has_dtypes(df, items): """ - Assert that a DataFrame has ``dtypes`` - + Assert that a DataFrame has ``dtypes``. + Parameters ========== df: DataFrame items: dict - mapping of columns to dtype. 
- + A mapping of column names to: + - dtypes, and/or + - functions (**not** other callables!) that take a pandas.Series instance as input, and + return ``True`` if the Series has the correct dtypes and ``False`` otherwise. Returns ======= df : DataFrame """ dtypes = df.dtypes for k, v in items.items(): - if not dtypes[k] == v: + if isinstance(v, FunctionType): + result = v(df[k]) + if result is True: + continue + raise AssertionError("{}'s function returned {!r}." + " Must return ``True`` to pass the check".format(k, v)) + if not result: + raise AssertionError("The function for {} returned {}".format(k, result)) + elif not dtypes[k] == v: raise AssertionError("{} has the wrong dtype ({})".format(k, v)) return df From 73a9d55e434f3c49c2d315fb48bac0865bada6cb Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sat, 27 May 2017 21:18:25 +0100 Subject: [PATCH 2/8] Update checks.py Small cleanup --- engarde/checks.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/engarde/checks.py b/engarde/checks.py index ae3ba23..e3c845a 100644 --- a/engarde/checks.py +++ b/engarde/checks.py @@ -217,12 +217,9 @@ def has_dtypes(df, items): for k, v in items.items(): if isinstance(v, FunctionType): result = v(df[k]) - if result is True: - continue - raise AssertionError("{}'s function returned {!r}." + if result is not True: + raise AssertionError("{}'s function returned {!r}." " Must return ``True`` to pass the check".format(k, v)) - if not result: - raise AssertionError("The function for {} returned {}".format(k, result)) elif not dtypes[k] == v: raise AssertionError("{} has the wrong dtype ({})".format(k, v)) return df From da5cffeb60cfe5183864bc70ecb6944391a45c6a Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sat, 27 May 2017 21:42:29 +0100 Subject: [PATCH 3/8] check pd.Series.dtype, not pd.Series. check ``pd.Series.dtype``, not ``pd.Series``. 
--- engarde/checks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/engarde/checks.py b/engarde/checks.py index e3c845a..97a71a6 100644 --- a/engarde/checks.py +++ b/engarde/checks.py @@ -207,8 +207,8 @@ def has_dtypes(df, items): items: dict A mapping of column names to: - dtypes, and/or - - functions (**not** other callables!) that take a pandas.Series instance as input, and - return ``True`` if the Series has the correct dtypes and ``False`` otherwise. + - functions (**not** other callables!) that take a pandas.Series.dtype instance as input, and + return ``True`` if the ``dtype`` has the correct dtype ``False`` otherwise. Returns ======= df : DataFrame @@ -216,7 +216,7 @@ def has_dtypes(df, items): dtypes = df.dtypes for k, v in items.items(): if isinstance(v, FunctionType): - result = v(df[k]) + result = v(dtypes[k]) if result is not True: raise AssertionError("{}'s function returned {!r}." " Must return ``True`` to pass the check".format(k, v)) From 4470e83f6b4957fca05d08310b7882ece3f44966 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sat, 27 May 2017 22:05:56 +0100 Subject: [PATCH 4/8] Update checks.py --- engarde/checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engarde/checks.py b/engarde/checks.py index 97a71a6..91a6862 100644 --- a/engarde/checks.py +++ b/engarde/checks.py @@ -219,7 +219,7 @@ def has_dtypes(df, items): result = v(dtypes[k]) if result is not True: raise AssertionError("{}'s function returned {!r}." - " Must return ``True`` to pass the check".format(k, v)) + " Must return True to pass the check".format(k, result)) elif not dtypes[k] == v: raise AssertionError("{} has the wrong dtype ({})".format(k, v)) return df From 2885137d20eb497dba0d4c8797d1d8f41009a63f Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 6 Jun 2017 16:46:00 +0100 Subject: [PATCH 5/8] Making errors clearer Clearer errors for users. 
--- engarde/checks.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/engarde/checks.py b/engarde/checks.py index 91a6862..e28f962 100644 --- a/engarde/checks.py +++ b/engarde/checks.py @@ -8,7 +8,6 @@ - Makes its assert on the result - Return the original DataFrame """ -from types import FunctionType import numpy as np import pandas as pd @@ -213,14 +212,18 @@ def has_dtypes(df, items): ======= df : DataFrame """ + from types import FunctionType + from pandas.api.types import is_dtype_equal dtypes = df.dtypes for k, v in items.items(): if isinstance(v, FunctionType): result = v(dtypes[k]) - if result is not True: - raise AssertionError("{}'s function returned {!r}." - " Must return True to pass the check".format(k, result)) - elif not dtypes[k] == v: + if not isinstance(result, bool): + raise AssertionError("The function for key {!r}" + " must return a boolean, returned {!r}".format(k, type(result))) + if not result: + raise AssertionError("{} has the wrong dtype ({}) for function ({})".format(k, dtypes[k], v.__name__)) + elif not is_dtype_equal(dtypes[k], v): raise AssertionError("{} has the wrong dtype ({})".format(k, v)) return df From 82049d325fb661398c87f2418537b741fba83072 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 6 Jun 2017 22:44:02 +0100 Subject: [PATCH 6/8] add test for has_dtypes with function as items values --- tests/test_checks.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_checks.py b/tests/test_checks.py index b7450c8..8f9a5c5 100644 --- a/tests/test_checks.py +++ b/tests/test_checks.py @@ -219,6 +219,26 @@ def test_has_dtypes(): with pytest.raises(AssertionError): dc.has_dtypes(items={'A': bool})(_noop)(df) +def test_has_dtypes_funcs(): + pat = pd.api.types + + df = pd.DataFrame({'A': np.random.randint(0, 10, 10), + 'B': np.random.randn(10), + 'C': list('abcdefghij'), + 'D': pd.Categorical(np.random.choice(['a', 'b'], 10))}) + dtypes = {'A': pat.is_integer_dtype, + 
'B': pat.is_float_dtype, + 'C': pat.is_string_dtype, + 'D': pat.is_category_dtype} + tm.assert_frame_equal(df, ck.has_dtypes(df, dtypes)) + tm.assert_frame_equal(df, dc.has_dtypes(items=dtypes)(_noop)(df)) + + with pytest.raises(AssertionError): + ck.has_dtypes(df, {'A': pat.is_float_dtype}) + + with pytest.raises(AssertionError): + dc.has_dtypes(items={'A': pat.is_bool_dtype})(_noop)(df) + def test_one_to_many(): df = pd.DataFrame({ 'parameter': ['Cu', 'Cu', 'Pb', 'Pb'], From 38d1d136b43f320fdd8da14fd458d4092c141418 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 6 Jun 2017 22:55:55 +0100 Subject: [PATCH 7/8] Update docs. --- engarde/checks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/engarde/checks.py b/engarde/checks.py index e28f962..37c0ad3 100644 --- a/engarde/checks.py +++ b/engarde/checks.py @@ -198,7 +198,7 @@ def within_n_std(df, n=3): def has_dtypes(df, items): """ - Assert that a DataFrame has ``dtypes``. + Assert that a DataFrame has ``dtypes`` as described in ``items``. Parameters ========== @@ -206,11 +206,13 @@ def has_dtypes(df, items): items: dict A mapping of column names to: - dtypes, and/or - - functions (**not** other callables!) that take a pandas.Series.dtype instance as input, and - return ``True`` if the ``dtype`` has the correct dtype ``False`` otherwise. + - functions (but **not** other callables!) that take a pandas.Series.dtype instance as input, and + return ``True`` if the ``dtype`` is of the correct dtype and ``False`` otherwise. 
+ Returns ======= df : DataFrame + """ from types import FunctionType from pandas.api.types import is_dtype_equal From 2d85bbef66beed8bc03c878b813867345b51af4b Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 6 Jun 2017 23:19:35 +0100 Subject: [PATCH 8/8] Update docs for has_dtypes with an example --- engarde/checks.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/engarde/checks.py b/engarde/checks.py index 37c0ad3..ab9c461 100644 --- a/engarde/checks.py +++ b/engarde/checks.py @@ -213,6 +213,20 @@ def has_dtypes(df, items): ======= df : DataFrame + Examples + ========= + + .. code:: python + + import numpy as np + import pandas as pd + import engarde.checks as ck + + df = pd.DataFrame({'A': np.random.randint(0, 10, 10), + 'B': np.random.randn(10)}) + df = df.pipe(ck.has_dtypes, items={'A': np.int32, + 'B': pd.api.types.is_float_dtype}) + """ from types import FunctionType from pandas.api.types import is_dtype_equal