diff --git a/doc/source/_static/whatsnew_assign.png b/doc/source/_static/whatsnew_assign.png new file mode 100644 index 0000000000000..0e39e161dc606 Binary files /dev/null and b/doc/source/_static/whatsnew_assign.png differ diff --git a/doc/source/basics.rst b/doc/source/basics.rst index dc43c1177f8c3..8e78ac597479b 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -11,6 +11,7 @@ from pandas.compat import lrange options.display.max_rows=15 + ============================== Essential Basic Functionality ============================== @@ -793,6 +794,7 @@ This is equivalent to the following result result.loc[:,:,'ItemA'] + .. _basics.reindexing: diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 44321375d31a2..6eb13ce722fff 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -450,6 +450,82 @@ available to insert at a particular location in the columns: df.insert(1, 'bar', df['one']) df +.. _dsintro.chained_assignment: + +Assigning New Columns in Method Chains +-------------------------------------- + +.. versionadded:: 0.16.0 + +Inspired by `dplyr's +<http://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html#mutate>`__ +``mutate`` verb, DataFrame has an :meth:`~pandas.DataFrame.assign` +method that allows you to easily create new columns that are potentially +derived from existing columns. + +.. ipython:: python + + iris = read_csv('data/iris.data') + iris.head() + + (iris.assign(sepal_ratio = iris['SepalWidth'] / iris['SepalLength']) + .head()) + +Above was an example of inserting a precomputed value. We can also pass in +a function of one argument to be evaluated on the DataFrame being assigned to. + +.. ipython:: python + + iris.assign(sepal_ratio = lambda x: (x['SepalWidth'] / + x['SepalLength'])).head() + +``assign`` **always** returns a copy of the data, leaving the original +DataFrame untouched. + +Passing a callable, as opposed to an actual value to be inserted, is +useful when you don't have a reference to the DataFrame at hand. 
This is +common when using ``assign`` in chains of operations. For example, +we can limit the DataFrame to just those observations with a Sepal Length +greater than 5, calculate the ratio, and plot: + +.. ipython:: python + + @savefig basics_assign.png + (iris.query('SepalLength > 5') + .assign(SepalRatio = lambda x: x.SepalWidth / x.SepalLength, + PetalRatio = lambda x: x.PetalWidth / x.PetalLength) + .plot(kind='scatter', x='SepalRatio', y='PetalRatio')) + +Since a function is passed in, the function is computed on the DataFrame +being assigned to. Importantly, this is the DataFrame that's been filtered +to those rows with sepal length greater than 5. The filtering happens first, +and then the ratio calculations. This is an example where we didn't +have a reference to the *filtered* DataFrame available. + +The function signature for ``assign`` is simply ``**kwargs``. The keys +are the column names for the new fields, and the values are either a value +to be inserted (for example, a ``Series`` or NumPy array), or a function +of one argument to be called on the ``DataFrame``. A *copy* of the original +DataFrame is returned, with the new values inserted. + +.. warning:: + + Since the function signature of ``assign`` is ``**kwargs``, a dictionary, + the order of the new columns in the resulting DataFrame cannot be guaranteed. + + All expressions are computed first, and then assigned. So you can't refer + to another column being assigned in the same call to ``assign``. For example: + + .. 
ipython:: + :verbatim: + + In [1]: # Don't do this, bad reference to `C` + df.assign(C = lambda x: x['A'] + x['B'], + D = lambda x: x['A'] + x['C']) + In [2]: # Instead, break it into two assigns + (df.assign(C = lambda x: x['A'] + x['B']) + .assign(D = lambda x: x['A'] + x['C'])) + Indexing / Selection ~~~~~~~~~~~~~~~~~~~~ The basics of indexing are as follows: diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index ead3c79430bf9..b9c358f24f460 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -29,6 +29,47 @@ New features This method is also exposed by the lower level ``Index.get_indexer`` and ``Index.get_loc`` methods. +- DataFrame assign method + +Inspired by `dplyr's +<http://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html#mutate>`__ ``mutate`` verb, DataFrame has a new +:meth:`~pandas.DataFrame.assign` method. +The function signature for ``assign`` is simply ``**kwargs``. The keys +are the column names for the new fields, and the values are either a value +to be inserted (for example, a ``Series`` or NumPy array), or a function +of one argument to be called on the ``DataFrame``. The new values are inserted, +and the entire DataFrame (with all original and new columns) is returned. + +.. ipython :: python + + iris = read_csv('data/iris.data') + iris.head() + + iris.assign(sepal_ratio=iris['SepalWidth'] / iris['SepalLength']).head() + +Above was an example of inserting a precomputed value. We can also pass in +a function to be evaluated. + +.. ipython :: python + + iris.assign(sepal_ratio = lambda x: (x['SepalWidth'] / + x['SepalLength'])).head() + +The power of ``assign`` comes when used in chains of operations. For example, +we can limit the DataFrame to just those with a Sepal Length greater than 5, +calculate the ratio, and plot + +.. 
ipython:: python + + (iris.query('SepalLength > 5') + .assign(SepalRatio = lambda x: x.SepalWidth / x.SepalLength, + PetalRatio = lambda x: x.PetalWidth / x.PetalLength) + .plot(kind='scatter', x='SepalRatio', y='PetalRatio')) + +.. image:: _static/whatsnew_assign.png + +See the :ref:`documentation <dsintro.chained_assignment>` for more. (:issue:`9229`) + .. _whatsnew_0160.api: .. _whatsnew_0160.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d64353db8cda6..97e3560e3fcb1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2220,6 +2220,88 @@ def insert(self, loc, column, value, allow_duplicates=False): self._data.insert( loc, column, value, allow_duplicates=allow_duplicates) + def assign(self, **kwargs): + """ + Assign new columns to a DataFrame, returning a new object + (a copy) with all the original columns in addition to the new ones. + + .. versionadded:: 0.16.0 + + Parameters + ---------- + kwargs : keyword, value pairs + keywords are the column names. If the values are + callable, they are computed on the DataFrame and + assigned to the new columns. If the values are + not callable, (e.g. a Series, scalar, or array), + they are simply assigned. + + Returns + ------- + df : DataFrame + A new DataFrame with the new columns in addition to + all the existing columns. + + Notes + ----- + Since ``kwargs`` is a dictionary, the order of your + arguments may not be preserved, and so the order of the + new columns is not well defined. Assigning multiple + columns within the same ``assign`` is possible, but you cannot + reference other columns created within the same ``assign`` call. 
+ + Examples + -------- + >>> df = DataFrame({'A': range(1, 11), 'B': np.random.randn(10)}) + + Where the value is a callable, evaluated on `df`: + + >>> df.assign(ln_A = lambda x: np.log(x.A)) + A B ln_A + 0 1 0.426905 0.000000 + 1 2 -0.780949 0.693147 + 2 3 -0.418711 1.098612 + 3 4 -0.269708 1.386294 + 4 5 -0.274002 1.609438 + 5 6 -0.500792 1.791759 + 6 7 1.649697 1.945910 + 7 8 -1.495604 2.079442 + 8 9 0.549296 2.197225 + 9 10 -0.758542 2.302585 + + Where the value already exists and is inserted: + + >>> newcol = np.log(df['A']) + >>> df.assign(ln_A=newcol) + A B ln_A + 0 1 0.426905 0.000000 + 1 2 -0.780949 0.693147 + 2 3 -0.418711 1.098612 + 3 4 -0.269708 1.386294 + 4 5 -0.274002 1.609438 + 5 6 -0.500792 1.791759 + 6 7 1.649697 1.945910 + 7 8 -1.495604 2.079442 + 8 9 0.549296 2.197225 + 9 10 -0.758542 2.302585 + """ + data = self.copy() + + # do all calculations first... + results = {} + for k, v in kwargs.items(): + + if callable(v): + results[k] = v(data) + else: + results[k] = v + + # ... 
and then assign + for k, v in results.items(): + data[k] = v + + return data + def _sanitize_column(self, key, value): # Need to make sure new columns (which go into the BlockManager as new # blocks) are always copied diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 9ec890a1d1856..f7c91501b683b 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -13965,6 +13965,60 @@ def test_select_dtypes_bad_arg_raises(self): with tm.assertRaisesRegexp(TypeError, 'data type.*not understood'): df.select_dtypes(['blargy, blarg, blarg']) + def test_assign(self): + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + original = df.copy() + result = df.assign(C=df.B / df.A) + expected = df.copy() + expected['C'] = [4, 2.5, 2] + assert_frame_equal(result, expected) + + # lambda syntax + result = df.assign(C=lambda x: x.B / x.A) + assert_frame_equal(result, expected) + + # original is unmodified + assert_frame_equal(df, original) + + # Non-Series array-like + result = df.assign(C=[4, 2.5, 2]) + assert_frame_equal(result, expected) + # original is unmodified + assert_frame_equal(df, original) + + result = df.assign(B=df.B / df.A) + expected = expected.drop('B', axis=1).rename(columns={'C': 'B'}) + assert_frame_equal(result, expected) + + # overwrite + result = df.assign(A=df.A + df.B) + expected = df.copy() + expected['A'] = [5, 7, 9] + assert_frame_equal(result, expected) + + # lambda + result = df.assign(A=lambda x: x.A + x.B) + assert_frame_equal(result, expected) + + def test_assign_multiple(self): + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) + expected = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9], + 'D': [1, 2, 3], 'E': [4, 5, 6]}) + # column order isn't preserved + assert_frame_equal(result.reindex_like(expected), expected) + + def test_assign_bad(self): + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + # non-keyword argument + with 
tm.assertRaises(TypeError): + df.assign(lambda x: x.A) + with tm.assertRaises(AttributeError): + df.assign(C=df.A, D=df.A + df.C) + with tm.assertRaises(KeyError): + df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C']) + with tm.assertRaises(KeyError): + df.assign(C=df.A, D=lambda x: x['A'] + x['C']) def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr':