From 356fa4a50509714c0e2d7682ec03660dbf917d53 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 May 2015 08:24:04 -0500 Subject: [PATCH 1/2] ENH: this is a pipe --- doc/source/basics.rst | 74 +++++++++++++++++++++++++++++++++ doc/source/faq.rst | 40 ------------------ doc/source/internals.rst | 2 +- doc/source/whatsnew/v0.16.2.txt | 57 +++++++++++++++++++++++++ doc/source/whatsnew/v0.17.0.txt | 1 + pandas/__init__.py | 1 - pandas/core/generic.py | 59 ++++++++++++++++++++++++++ pandas/tests/test_generic.py | 42 +++++++++++++++++++ 8 files changed, 234 insertions(+), 42 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index d16feb3a6c448..349e7e25fdafb 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -624,6 +624,77 @@ We can also pass infinite values to define the bins: Function application -------------------- +To apply your own or another library's functions to pandas objects, +you should be aware of the three methods below. The appropriate +method to use depends on whether your function expects to operate +on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise. + +1. `Tablewise Function Application`_: :meth:`~DataFrame.pipe` +2. `Row or Column-wise Function Application`_: :meth:`~DataFrame.apply` +3. Elementwise_ function application: :meth:`~DataFrame.applymap` + +.. _basics.pipe: + +Tablewise Function Application +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.16.2 + +``DataFrames`` and ``Series`` can of course just be passed into functions. +However, if the function needs to be called in a chain, consider using the :meth:`~DataFrame.pipe` method. +Compare the following + +.. code-block:: python + + # f, g, and h are functions taking and returning ``DataFrames`` + >>> f(g(h(df), arg1=1), arg2=2, arg3=3) + +with the equivalent + +.. 
code-block:: python + + >>> (df.pipe(h) + .pipe(g, arg1=1) + .pipe(f, arg2=2, arg3=3) + ) + +Pandas encourages the second style, which is known as method chaining. +``pipe`` makes it easy to use your own or another library's functions +in method chains, alongside pandas' methods. + +In the example above, the functions ``f``, ``g``, and ``h`` each expected the ``DataFrame`` as the first positional argument. +What if the function you wish to apply takes its data as, say, the second argument? +In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``. +``.pipe`` will route the ``DataFrame`` to the argument specified in the tuple. + +For example, we can fit a regression using statsmodels. Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.poisson, 'data')`` to ``pipe``: + +.. ipython:: python + + import statsmodels.formula.api as sm + + bb = pd.read_csv('data/baseball.csv', index_col='id') + + (bb.query('h > 0') + .assign(ln_h = lambda df: np.log(df.h)) + .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)') + .fit() + .summary() + ) + +The pipe method is inspired by Unix pipes and more recently dplyr_ and magrittr_, which +have introduced the popular ``(%>%)`` (read pipe) operator for R_. +The implementation of ``pipe`` here is quite clean and feels right at home in Python. +We encourage you to view the source code (``pd.DataFrame.pipe??`` in IPython). + +.. _dplyr: https://github.com/hadley/dplyr +.. _magrittr: https://github.com/smbache/magrittr +..
_R: http://www.r-project.org + + +Row or Column-wise Function Application +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Arbitrary functions can be applied along the axes of a DataFrame or Panel using the :meth:`~DataFrame.apply` method, which, like the descriptive statistics methods, take an optional ``axis`` argument: @@ -678,6 +749,7 @@ Series operation on each column or row: tsdf tsdf.apply(pd.Series.interpolate) + Finally, :meth:`~DataFrame.apply` takes an argument ``raw`` which is False by default, which converts each row or column into a Series before applying the function. When set to True, the passed function will instead receive an ndarray object, which @@ -690,6 +762,8 @@ functionality. functionality for grouping by some criterion, applying, and combining the results into a Series, DataFrame, etc. +.. _Elementwise: + Applying elementwise Python functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/faq.rst b/doc/source/faq.rst index 1fc8488e92fde..32290839ad71d 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -89,46 +89,6 @@ representation; i.e., 1KB = 1024 bytes). See also :ref:`Categorical Memory Usage `. -.. _ref-monkey-patching: - -Adding Features to your pandas Installation -------------------------------------------- - -pandas is a powerful tool and already has a plethora of data manipulation -operations implemented, most of them are very fast as well. -It's very possible however that certain functionality that would make your -life easier is missing. In that case you have several options: - -1) Open an issue on `Github `__ , explain your need and the sort of functionality you would like to see implemented. -2) Fork the repo, Implement the functionality yourself and open a PR - on Github. -3) Write a method that performs the operation you are interested in and - Monkey-patch the pandas class as part of your IPython profile startup - or PYTHONSTARTUP file. 
- - For example, here is an example of adding an ``just_foo_cols()`` - method to the dataframe class: - -:: - - import pandas as pd - def just_foo_cols(self): - """Get a list of column names containing the string 'foo' - - """ - return [x for x in self.columns if 'foo' in x] - - pd.DataFrame.just_foo_cols = just_foo_cols # monkey-patch the DataFrame class - df = pd.DataFrame([list(range(4))], columns=["A","foo","foozball","bar"]) - df.just_foo_cols() - del pd.DataFrame.just_foo_cols # you can also remove the new method - - -Monkey-patching is usually frowned upon because it makes your code -less portable and can cause subtle bugs in some circumstances. -Monkey-patching existing methods is usually a bad idea in that respect. -When used with proper care, however, it's a very useful tool to have. - .. _ref-scikits-migration: diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 17be04cd64d27..8b4f7360fc235 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -101,7 +101,7 @@ Subclassing pandas Data Structures .. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures. - 1. Monkey-patching: See :ref:`Adding Features to your pandas Installation <ref-monkey-patching>`. + 1. Extensible method chains with :ref:`pipe <basics.pipe>` 2. Use *composition*. See `here `_. diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index 627c79f7289b7..9421ab0f841ac 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -10,6 +10,7 @@ We recommend that all users upgrade to this version. Highlights include: - Documentation on how to use ``numba`` with *pandas*, see :ref:`here ` +- A new ``pipe`` method, see :ref:`here <whatsnew_0162.enhancements.pipe>` Check the :ref:`API Changes ` before updating. @@ -22,6 +23,62 @@ Check the :ref:`API Changes ` before updating. New features ~~~~~~~~~~~~ +.. _whatsnew_0162.enhancements.pipe: + +Pipe +^^^^ + +We've introduced a new method :meth:`DataFrame.pipe`.
As suggested by the name, ``pipe`` +should be used to pipe data through a chain of function calls. +The goal is to avoid confusing nested function calls like + + .. code-block:: python + + # df is a DataFrame + # f, g, and h are functions that take and return DataFrames + f(g(h(df), arg1=1), arg2=2, arg3=3) + +The logic flows from inside out, and function names are separated from their keyword arguments. +This can be rewritten as + + .. code-block:: python + + (df.pipe(h) + .pipe(g, arg1=1) + .pipe(f, arg2=2, arg3=3) + ) + +Now both the code and the logic flow from top to bottom. Keyword arguments are next to +their functions. Overall the code is much more readable. + +In the example above, the functions ``f``, ``g``, and ``h`` each expected the DataFrame as the first positional argument. +When the function you wish to apply takes its data anywhere other than the first argument, pass a tuple +of ``(function, keyword)`` indicating where the DataFrame should flow. For example: + +.. ipython:: python + + import statsmodels.formula.api as sm + + bb = pd.read_csv('data/baseball.csv', index_col='id') + + # sm.poisson takes (formula, data) + (bb.query('h > 0') + .assign(ln_h = lambda df: np.log(df.h)) + .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)') + .fit() + .summary() + ) + +The pipe method is inspired by Unix pipes, which stream text through +processes. More recently dplyr_ and magrittr_ have introduced the +popular ``(%>%)`` pipe operator for R_. + +See the :ref:`documentation <basics.pipe>` for more. (:issue:`10129`) + +.. _dplyr: https://github.com/hadley/dplyr +.. _magrittr: https://github.com/smbache/magrittr +.. _R: http://www.r-project.org + ..
_whatsnew_0162.enhancements.other: Other enhancements diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 87a9d197bd0d1..164ab73def894 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -21,6 +21,7 @@ Check the :ref:`API Changes ` and :ref:`deprecations >> f(g(h(df), arg1=a), arg2=b, arg3=c) + + You can write + + >>> (df.pipe(h) + ... .pipe(g, arg1=a) + ... .pipe(f, arg2=b, arg3=c) + ... ) + + If you have a function that takes the data as (say) the second + argument, pass a tuple indicating which keyword expects the + data. For example, suppose ``f`` takes its data as ``arg2``: + + >>> (df.pipe(h) + ... .pipe(g, arg1=a) + ... .pipe((f, 'arg2'), arg1=a, arg3=c) + ... ) + + See Also + -------- + pandas.DataFrame.apply + pandas.DataFrame.applymap + pandas.Series.map + """ + ) + @Appender(_shared_docs['pipe'] % _shared_doc_kwargs) + def pipe(self, func, *args, **kwargs): + if isinstance(func, tuple): + func, target = func + kwargs[target] = self + return func(*args, **kwargs) + else: + return func(self, *args, **kwargs) #---------------------------------------------------------------------- # Attribute access diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index a03fe3c2241a3..44f7791b7f8ba 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1649,6 +1649,48 @@ def test_describe_raises(self): with tm.assertRaises(NotImplementedError): tm.makePanel().describe() + def test_pipe(self): + df = DataFrame({'A': [1, 2, 3]}) + f = lambda x, y: x ** y + result = df.pipe(f, 2) + expected = DataFrame({'A': [1, 4, 9]}) + self.assert_frame_equal(result, expected) + + result = df.A.pipe(f, 2) + self.assert_series_equal(result, expected.A) + + def test_pipe_tuple(self): + df = DataFrame({'A': [1, 2, 3]}) + f = lambda x, y: y + result = df.pipe((f, 'y'), 0) + self.assert_frame_equal(result, df) + + result = df.A.pipe((f, 'y'), 0) + self.assert_series_equal(result, 
df.A) + + def test_pipe_tuple_error(self): + df = DataFrame({"A": [1, 2, 3]}) + f = lambda x, y: y + with tm.assertRaises(ValueError): + result = df.pipe((f, 'y'), x=1, y=0) + + with tm.assertRaises(ValueError): + result = df.A.pipe((f, 'y'), x=1, y=0) + + def test_pipe_panel(self): + wp = Panel({'r1': DataFrame({"A": [1, 2, 3]})}) + f = lambda x, y: x + y + result = wp.pipe(f, 2) + expected = wp + 2 + assert_panel_equal(result, expected) + + result = wp.pipe((f, 'y'), x=1) + expected = wp + 1 + assert_panel_equal(result, expected) + + with tm.assertRaises(ValueError): + result = wp.pipe((f, 'y'), x=1, y=1) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 0c3bf51befbbc14b22cb28ac9ee1e9406b3790ff Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jun 2015 20:51:20 -0500 Subject: [PATCH 2/2] API: catch target kwarg clobbering --- pandas/core/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6bcad335a2307..0b6476950333e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2100,6 +2100,9 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No def pipe(self, func, *args, **kwargs): if isinstance(func, tuple): func, target = func + if target in kwargs: + msg = '%s is both the pipe target and a keyword argument' % target + raise ValueError(msg) kwargs[target] = self return func(*args, **kwargs) else: