Merge branch 'wide-to-long' of https://github.com/jseabold/pandas into jseabold-wide-to-long

jreback · jreback · commit b8c675695a9d · 2013-12-07T09:14:37.000-05:00
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
@@ -218,6 +218,20 @@ For instance,
    melt(cheese, id_vars=['first', 'last'])
    melt(cheese, id_vars=['first', 'last'], var_name='quantity')
 
+Another way to transform is to use the ``wide_to_long`` panel data convenience function.
+
+.. ipython:: python
+
+  dft = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
+                      "A1980" : {0 : "d", 1 : "e", 2 : "f"},
+                      "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
+                      "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
+                      "X"     : dict(zip(range(3), np.random.randn(3)))
+                     })
+  dft["id"] = dft.index
+  dft
+  pd.wide_to_long(dft, ["A", "B"], i="id", j="year")
+
 Combining with stats and GroupBy
 --------------------------------
 
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -10,7 +10,7 @@ Highlights include support for a new index type ``Float64Index``, support for ne
 Several experimental features are added, including new ``eval/query`` methods for expression evaluation, support for ``msgpack`` serialization,
 and an io interface to Google's ``BigQuery``.
 
-The docs also received a new section, :ref:`Comparison with SQL<compare_with_sql>`, which should 
+The docs also received a new section, :ref:`Comparison with SQL<compare_with_sql>`, which should
 be useful for those familiar with SQL but still learning pandas.
 
 .. warning::
@@ -313,7 +313,7 @@ HDFStore API Changes
      os.remove(path)
 
 - the ``format`` keyword now replaces the ``table`` keyword; allowed values are ``fixed(f)`` or ``table(t)``
-  the same defaults as prior < 0.13.0 remain, e.g. ``put`` implies ``fixed`` format and ``append`` implies 
+  the same defaults as prior < 0.13.0 remain, e.g. ``put`` implies ``fixed`` format and ``append`` implies
   ``table`` format. This default format can be set as an option by setting ``io.hdf.default_format``.
 
   .. ipython:: python
@@ -618,6 +618,23 @@ Enhancements
     ser = Series([1, 3, np.nan, np.nan, np.nan, 11])
     ser.interpolate(limit=2)
 
+- Added ``wide_to_long`` panel data convenience function.
+
+  .. ipython:: python
+
+    import pandas as pd
+    import numpy as np
+    np.random.seed(123)
+    df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
+                       "A1980" : {0 : "d", 1 : "e", 2 : "f"},
+                       "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
+                       "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
+                       "X"     : dict(zip(range(3), np.random.randn(3)))
+                      })
+    df["id"] = df.index
+    df
+    pd.wide_to_long(df, ["A", "B"], i="id", j="year")
+
 .. _scipy: http://www.scipy.org
 .. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation
 .. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
@@ -894,7 +911,7 @@ to unify methods and behaviors. Series formerly subclassed directly from
 - added ``ftypes`` method to Series/DataFrame, similar to ``dtypes``, but indicates
   if the underlying is sparse/dense (as well as the dtype)
 - All ``NDFrame`` objects can now use ``__finalize__()`` to specify various
-  values to propagate to new objects from an existing one (e.g. ``name`` in ``Series`` will 
+  values to propagate to new objects from an existing one (e.g. ``name`` in ``Series`` will
   follow more automatically now)
 - Internal type checking is now done via a suite of generated classes, allowing ``isinstance(value, klass)``
   without having to directly import the klass, courtesy of @jtratner
diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -15,7 +15,7 @@
 from pandas.core.panel4d import Panel4D
 from pandas.core.groupby import groupby
 from pandas.core.reshape import (pivot_simple as pivot, get_dummies,
-                                 lreshape)
+                                 lreshape, wide_to_long)
 
 WidePanel = Panel
 
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -786,6 +786,89 @@ def lreshape(data, groups, dropna=True, label=None):
 
     return DataFrame(mdata, columns=id_cols + pivot_cols)
 
+def wide_to_long(df, stubnames, i, j):
+    """
+    Wide panel to long format. Less flexible but more user-friendly than melt.
+
+    Parameters
+    ----------
+    df : DataFrame
+        The wide-format DataFrame
+    stubnames : list
+        A list of stub names. The wide format variables are assumed to
+        start with the stub names. The caller's list is left unmodified.
+    i : str
+        The name of the id variable.
+    j : str
+        The name of the subobservation variable.
+
+    Returns
+    -------
+    DataFrame
+        A DataFrame that contains each stub name as a variable as well as
+        variables for i and j.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> np.random.seed(123)
+    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
+    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
+    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
+    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
+    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
+    ...                   })
+    >>> df["id"] = df.index
+    >>> df
+      A1970 A1980  B1970  B1980         X  id
+    0     a     d    2.5    3.2 -1.085631   0
+    1     b     e    1.2    1.3  0.997345   1
+    2     c     f    0.7    0.1  0.282978   2
+    >>> wide_to_long(df, ["A", "B"], i="id", j="year")
+                    X  A    B
+    id year
+    0  1970 -1.085631  a  2.5
+    1  1970  0.997345  b  1.2
+    2  1970  0.282978  c  0.7
+    0  1980 -1.085631  d  3.2
+    1  1980  0.997345  e  1.3
+    2  1980  0.282978  f  0.1
+
+    Notes
+    -----
+    All extra variables are treated as extra id variables. This simply uses
+    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
+    in a typical case.
+    """
+    import re  # local import: escapes stub names before they enter a regex
+
+    def get_var_names(df, regex):
+        return df.filter(regex=regex).columns.tolist()
+
+    def melt_stub(df, stub, i, j):
+        varnames = get_var_names(df, "^" + re.escape(stub))
+        newdf = melt(df, id_vars=i, value_vars=varnames,
+                     value_name=stub, var_name=j)
+        newdf_j = newdf[j].str[len(stub):]  # drop only the leading stub
+        try:
+            newdf_j = newdf_j.astype(int)
+        except ValueError:
+            pass  # suffixes are not all integers; keep them as strings
+        newdf[j] = newdf_j
+        return newdf
+
+    stubnames = list(stubnames)  # copy: never mutate the caller's list
+    escaped = "|".join(re.escape(stub) for stub in stubnames)
+    id_vars = get_var_names(df, "^(?!%s)" % escaped)
+    if i not in id_vars:
+        id_vars += [i]
+
+    newdf = melt_stub(df, stubnames[0], id_vars, j)
+    for stub in stubnames[1:]:
+        new = melt_stub(df, stub, id_vars, j)
+        newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
+    return newdf.set_index([i, j])
 
 def convert_dummies(data, cat_variables, prefix_sep='_'):
     """
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
@@ -15,7 +15,8 @@
 from pandas.util.testing import assert_frame_equal
 from numpy.testing import assert_array_equal
 
-from pandas.core.reshape import melt, convert_dummies, lreshape, get_dummies
+from pandas.core.reshape import (melt, convert_dummies, lreshape, get_dummies,
+                                 wide_to_long)
 import pandas.util.testing as tm
 from pandas.compat import StringIO, cPickle, range
 
@@ -296,6 +297,27 @@ def test_pairs(self):
                 'wt': ['wt%d' % i for i in range(1, 4)]}
         self.assertRaises(ValueError, lreshape, df, spec)
 
+class TestWideToLong(tm.TestCase):
+    def test_simple(self):
+        # Stubs A and B over years 1970/1980; X and id ride along as ids.
+        np.random.seed(123)
+        noise = np.random.randn(3)
+        wide = pd.DataFrame({"A1970": {0: "a", 1: "b", 2: "c"},
+                             "A1980": {0: "d", 1: "e", 2: "f"},
+                             "B1970": {0: 2.5, 1: 1.2, 2: .7},
+                             "B1980": {0: 3.2, 1: 1.3, 2: .1},
+                             "X": dict(zip(range(3), noise))})
+        wide["id"] = wide.index
+        result = wide_to_long(wide, ["A", "B"], i="id", j="year")
+
+        expected = DataFrame({"X": noise.tolist() * 2,
+                              "A": list("abcdef"),
+                              "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+                              "year": [1970] * 3 + [1980] * 3,
+                              "id": [0, 1, 2] * 2})
+        expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
+        tm.assert_frame_equal(result, expected)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],