From 356fa4a50509714c0e2d7682ec03660dbf917d53 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 May 2015 08:24:04 -0500
Subject: [PATCH 1/2] ENH: this is a pipe

---
 doc/source/basics.rst           | 74 +++++++++++++++++++++++++++++++++
 doc/source/faq.rst              | 40 ------------------
 doc/source/internals.rst        |  2 +-
 doc/source/whatsnew/v0.16.2.txt | 57 +++++++++++++++++++++++++
 doc/source/whatsnew/v0.17.0.txt |  1 +
 pandas/__init__.py              |  1 -
 pandas/core/generic.py          | 59 ++++++++++++++++++++++++++
 pandas/tests/test_generic.py    | 42 +++++++++++++++++++
 8 files changed, 234 insertions(+), 42 deletions(-)

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index d16feb3a6c448..349e7e25fdafb 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -624,6 +624,77 @@ We can also pass infinite values to define the bins:
 Function application
 --------------------
 
+To apply your own or another library's functions to pandas objects,
+you should be aware of the three methods below. The appropriate
+method to use depends on whether your function expects to operate
+on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise.
+
+1. `Tablewise Function Application`_: :meth:`~DataFrame.pipe`
+2. `Row or Column-wise Function Application`_: :meth:`~DataFrame.apply`
+3. Elementwise_ function application: :meth:`~DataFrame.applymap`
+
+.. _basics.pipe:
+
+Tablewise Function Application
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.16.2
+
+``DataFrames`` and ``Series`` can of course just be passed into functions.
+However, if the function needs to be called in a chain, consider using the :meth:`~DataFrame.pipe` method.
+Compare the following
+
+.. code-block:: python
+
+   # f, g, and h are functions taking and returning ``DataFrames``
+   >>> f(g(h(df), arg1=1), arg2=2, arg3=3)
+
+with the equivalent
+
+.. code-block:: python
+
+   >>> (df.pipe(h)
+          .pipe(g, arg1=1)
+          .pipe(f, arg2=2, arg3=3)
+       )
+
+Pandas encourages the second style, which is known as method chaining.
+``pipe`` makes it easy to use your own or another library's functions
+in method chains, alongside pandas' methods.
+
+In the example above, the functions ``f``, ``g``, and ``h`` each expected the ``DataFrame`` as the first positional argument.
+What if the function you wish to apply takes its data as, say, the second argument?
+In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``.
+``.pipe`` will route the ``DataFrame`` to the argument specified in the tuple.
+
+For example, we can fit a regression using statsmodels. Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.poisson, 'data')`` to ``pipe``:
+
+.. ipython:: python
+
+   import statsmodels.formula.api as sm
+
+   bb = pd.read_csv('data/baseball.csv', index_col='id')
+
+   (bb.query('h > 0')
+      .assign(ln_h = lambda df: np.log(df.h))
+      .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)')
+      .fit()
+      .summary()
+   )
+
+The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which
+have introduced the popular ``(%>%)`` (read pipe) operator for R_.
+The implementation of ``pipe`` here is quite clean and feels right at home in Python.
+We encourage you to view the source code (``pd.DataFrame.pipe??`` in IPython).
+
+.. _dplyr: https://github.com/hadley/dplyr
+.. _magrittr: https://github.com/smbache/magrittr
+.. _R: http://www.r-project.org
+
+
+Row or Column-wise Function Application
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 Arbitrary functions can be applied along the axes of a DataFrame or Panel
 using the :meth:`~DataFrame.apply` method, which, like the descriptive
 statistics methods, take an optional ``axis`` argument:
@@ -678,6 +749,7 @@ Series operation on each column or row:
    tsdf
    tsdf.apply(pd.Series.interpolate)
 
+
 Finally, :meth:`~DataFrame.apply` takes an argument ``raw`` which is False by default, which
 converts each row or column into a Series before applying the function. When
 set to True, the passed function will instead receive an ndarray object, which
@@ -690,6 +762,8 @@ functionality.
    functionality for grouping by some criterion, applying, and combining the
    results into a Series, DataFrame, etc.
 
+.. _Elementwise:
+
 Applying elementwise Python functions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/faq.rst b/doc/source/faq.rst
index 1fc8488e92fde..32290839ad71d 100644
--- a/doc/source/faq.rst
+++ b/doc/source/faq.rst
@@ -89,46 +89,6 @@ representation; i.e., 1KB = 1024 bytes).
 
 See also :ref:`Categorical Memory Usage <categorical.memory>`.
 
-.. _ref-monkey-patching:
-
-Adding Features to your pandas Installation
--------------------------------------------
-
-pandas is a powerful tool and already has a plethora of data manipulation
-operations implemented, most of them are very fast as well.
-It's very possible however that certain functionality that would make your
-life easier is missing. In that case you have several options:
-
-1) Open an issue on `Github <https://github.com/pydata/pandas/issues/>`__ , explain your need and the sort of functionality you would like to see implemented.
-2) Fork the repo, Implement the functionality yourself and open a PR
-   on Github.
-3) Write a method that performs the operation you are interested in and
-   Monkey-patch the pandas class as part of your IPython profile startup
-   or PYTHONSTARTUP file.
-
-   For example, here is an example of adding an ``just_foo_cols()``
-   method to the dataframe class:
-
-::
-
-   import pandas as pd
-   def just_foo_cols(self):
-       """Get a list of column names containing the string 'foo'
-
-       """
-       return [x for x in self.columns if 'foo' in x]
-
-   pd.DataFrame.just_foo_cols = just_foo_cols # monkey-patch the DataFrame class
-   df = pd.DataFrame([list(range(4))], columns=["A","foo","foozball","bar"])
-   df.just_foo_cols()
-   del pd.DataFrame.just_foo_cols # you can also remove the new method
-
-
-Monkey-patching is usually frowned upon because it makes your code
-less portable and can cause subtle bugs in some circumstances.
-Monkey-patching existing methods is usually a bad idea in that respect.
-When used with proper care, however, it's a very useful tool to have.
-
 
 .. _ref-scikits-migration:
 
diff --git a/doc/source/internals.rst b/doc/source/internals.rst
index 17be04cd64d27..8b4f7360fc235 100644
--- a/doc/source/internals.rst
+++ b/doc/source/internals.rst
@@ -101,7 +101,7 @@ Subclassing pandas Data Structures
 
 .. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures.
 
-  1. Monkey-patching: See :ref:`Adding Features to your pandas Installation <ref-monkey-patching>`.
+  1. Extensible method chains with :ref:`pipe <basics.pipe>`
 
   2. Use *composition*. See `here <http://en.wikipedia.org/wiki/Composition_over_inheritance>`_.
 
diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt
index 627c79f7289b7..9421ab0f841ac 100644
--- a/doc/source/whatsnew/v0.16.2.txt
+++ b/doc/source/whatsnew/v0.16.2.txt
@@ -10,6 +10,7 @@ We recommend that all users upgrade to this version.
 Highlights include:
 
 - Documentation on how to use ``numba`` with *pandas*, see :ref:`here <enhancingperf.numba>`
+- A new ``pipe`` method, see :ref:`here <whatsnew_0162.enhancements.pipe>`
 
 Check the :ref:`API Changes <whatsnew_0162.api>` before updating.
 
@@ -22,6 +23,62 @@ Check the :ref:`API Changes <whatsnew_0162.api>` before updating.
 New features
 ~~~~~~~~~~~~
 
+.. _whatsnew_0162.enhancements.pipe:
+
+Pipe
+^^^^
+
+We've introduced a new method :meth:`DataFrame.pipe`. As suggested by the name, ``pipe``
+should be used to pipe data through a chain of function calls.
+The goal is to avoid confusing nested function calls like
+
+  .. code-block:: python
+
+     # df is a DataFrame
+     # f, g, and h are functions that take and return DataFrames
+     f(g(h(df), arg1=1), arg2=2, arg3=3)
+
+The logic flows from inside out, and function names are separated from their keyword arguments.
+This can be rewritten as
+
+  .. code-block:: python
+
+     (df.pipe(h)
+        .pipe(g, arg1=1)
+        .pipe(f, arg2=2, arg3=3)
+     )
+
+Now both the code and the logic flow from top to bottom. Keyword arguments are next to
+their functions. Overall the code is much more readable.
+
+In the example above, the functions ``f``, ``g``, and ``h`` each expected the DataFrame as the first positional argument.
+When the function you wish to apply takes its data anywhere other than the first argument, pass a tuple
+of ``(function, keyword)`` indicating where the DataFrame should flow. For example:
+
+.. ipython:: python
+
+   import statsmodels.formula.api as sm
+
+   bb = pd.read_csv('data/baseball.csv', index_col='id')
+
+   # sm.poisson takes (formula, data)
+   (bb.query('h > 0')
+      .assign(ln_h = lambda df: np.log(df.h))
+      .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)')
+      .fit()
+      .summary()
+   )
+
+The pipe method is inspired by unix pipes, which stream text through
+processes. More recently dplyr_ and magrittr_ have introduced the
+popular ``(%>%)`` pipe operator for R_.
+
+See the :ref:`documentation <basics.pipe>` for more. (:issue:`10129`)
+
+.. _dplyr: https://github.com/hadley/dplyr
+.. _magrittr: https://github.com/smbache/magrittr
+.. _R: http://www.r-project.org
+
 .. _whatsnew_0162.enhancements.other:
 
 Other enhancements
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 87a9d197bd0d1..164ab73def894 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -21,6 +21,7 @@ Check the :ref:`API Changes <whatsnew_0170.api>` and :ref:`deprecations <whatsne
 New features
 ~~~~~~~~~~~~
 
+
 .. _whatsnew_0170.enhancements.other:
 
 Other enhancements
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 2a142a6ff2072..0e7bc628fdb6a 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -57,4 +57,3 @@
 from pandas.util.print_versions import show_versions
 import pandas.util.testing
 
-
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3bf90aaf71849..6bcad335a2307 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2045,6 +2045,65 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
         locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
         return self.take(locs, axis=axis)
 
+    _shared_docs['pipe'] = ("""
+        Apply func(self, *args, **kwargs)
+
+        .. versionadded:: 0.16.2
+
+        Parameters
+        ----------
+        func : function
+            function to apply to the %(klass)s.
+            ``args`` and ``kwargs`` are passed into ``func``.
+            Alternatively a ``(callable, data_keyword)`` tuple where
+            ``data_keyword`` is a string indicating the keyword of
+            ``callable`` that expects the %(klass)s.
+        args : positional arguments passed into ``func``.
+        kwargs : a dictionary of keyword arguments passed into ``func``.
+
+        Returns
+        -------
+        object : the return type of ``func``.
+
+        Notes
+        -----
+
+        Use ``.pipe`` when chaining together functions that expect
+        Series or DataFrames. Instead of writing
+
+        >>> f(g(h(df), arg1=a), arg2=b, arg3=c)
+
+        You can write
+
+        >>> (df.pipe(h)
+        ...    .pipe(g, arg1=a)
+        ...    .pipe(f, arg2=b, arg3=c)
+        ... )
+
+        If you have a function that takes the data as (say) the second
+        argument, pass a tuple indicating which keyword expects the
+        data. For example, suppose ``f`` takes its data as ``arg2``:
+
+        >>> (df.pipe(h)
+        ...    .pipe(g, arg1=a)
+        ...    .pipe((f, 'arg2'), arg1=a, arg3=c)
+        ...  )
+
+        See Also
+        --------
+        pandas.DataFrame.apply
+        pandas.DataFrame.applymap
+        pandas.Series.map
+    """
+    )
+    @Appender(_shared_docs['pipe'] % _shared_doc_kwargs)
+    def pipe(self, func, *args, **kwargs):
+        if isinstance(func, tuple):
+            func, target = func
+            kwargs[target] = self
+            return func(*args, **kwargs)
+        else:
+            return func(self, *args, **kwargs)
 
     #----------------------------------------------------------------------
     # Attribute access
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index a03fe3c2241a3..44f7791b7f8ba 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -1649,6 +1649,48 @@ def test_describe_raises(self):
         with tm.assertRaises(NotImplementedError):
             tm.makePanel().describe()
 
+    def test_pipe(self):
+        df = DataFrame({'A': [1, 2, 3]})
+        f = lambda x, y: x ** y
+        result = df.pipe(f, 2)
+        expected = DataFrame({'A': [1, 4, 9]})
+        self.assert_frame_equal(result, expected)
+
+        result = df.A.pipe(f, 2)
+        self.assert_series_equal(result, expected.A)
+
+    def test_pipe_tuple(self):
+        df = DataFrame({'A': [1, 2, 3]})
+        f = lambda x, y: y
+        result = df.pipe((f, 'y'), 0)
+        self.assert_frame_equal(result, df)
+
+        result = df.A.pipe((f, 'y'), 0)
+        self.assert_series_equal(result, df.A)
+
+    def test_pipe_tuple_error(self):
+        df = DataFrame({"A": [1, 2, 3]})
+        f = lambda x, y: y
+        with tm.assertRaises(ValueError):
+            result = df.pipe((f, 'y'), x=1, y=0)
+
+        with tm.assertRaises(ValueError):
+            result = df.A.pipe((f, 'y'), x=1, y=0)
+
+    def test_pipe_panel(self):
+        wp = Panel({'r1': DataFrame({"A": [1, 2, 3]})})
+        f = lambda x, y: x + y
+        result = wp.pipe(f, 2)
+        expected = wp + 2
+        assert_panel_equal(result, expected)
+
+        result = wp.pipe((f, 'y'), x=1)
+        expected = wp + 1
+        assert_panel_equal(result, expected)
+
+        with tm.assertRaises(ValueError):
+            result = wp.pipe((f, 'y'), x=1, y=1)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)

From 0c3bf51befbbc14b22cb28ac9ee1e9406b3790ff Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 3 Jun 2015 20:51:20 -0500
Subject: [PATCH 2/2] API: catch target kwarg clobbering

---
 pandas/core/generic.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6bcad335a2307..0b6476950333e 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2100,6 +2100,9 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
     def pipe(self, func, *args, **kwargs):
         if isinstance(func, tuple):
             func, target = func
+            if target in kwargs:
+                msg = '%s is both the pipe target and a keyword argument' % target
+                raise ValueError(msg)
             kwargs[target] = self
             return func(*args, **kwargs)
         else: