Skip to content

Commit b8c6756

Browse files
committed
Merge branch 'wide-to-long' of https://github.com/jseabold/pandas into jseabold-wide-to-long
2 parents 55e624d + 42a8e97 commit b8c6756

File tree

5 files changed

+141
-5
lines changed

5 files changed

+141
-5
lines changed

doc/source/reshaping.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,20 @@ For instance,
218218
melt(cheese, id_vars=['first', 'last'])
219219
melt(cheese, id_vars=['first', 'last'], var_name='quantity')
220220
221+
Another way to transform is to use the ``wide_to_long`` panel data convenience function.
222+
223+
.. ipython:: python
224+
225+
dft = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
226+
"A1980" : {0 : "d", 1 : "e", 2 : "f"},
227+
"B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
228+
"B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
229+
"X" : dict(zip(range(3), np.random.randn(3)))
230+
})
231+
dft["id"] = dft.index
232+
dft
233+
pd.wide_to_long(dft, ["A", "B"], i="id", j="year")
234+
221235
Combining with stats and GroupBy
222236
--------------------------------
223237

doc/source/v0.13.0.txt

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Highlights include support for a new index type ``Float64Index``, support for ne
1010
Several experimental features are added, including new ``eval/query`` methods for expression evaluation, support for ``msgpack`` serialization,
1111
and an io interface to Google's ``BigQuery``.
1212

13-
The docs also received a new section, :ref:`Comparison with SQL<compare_with_sql>`, which should
13+
The docs also received a new section, :ref:`Comparison with SQL<compare_with_sql>`, which should
1414
be useful for those familiar with SQL but still learning pandas.
1515

1616
.. warning::
@@ -313,7 +313,7 @@ HDFStore API Changes
313313
os.remove(path)
314314

315315
- the ``format`` keyword now replaces the ``table`` keyword; allowed values are ``fixed(f)`` or ``table(t)``
316-
the same defaults as prior < 0.13.0 remain, e.g. ``put`` implies ``fixed`` format and ``append`` implies
316+
the same defaults as prior < 0.13.0 remain, e.g. ``put`` implies ``fixed`` format and ``append`` implies
317317
``table`` format. This default format can be set as an option by setting ``io.hdf.default_format``.
318318

319319
.. ipython:: python
@@ -618,6 +618,23 @@ Enhancements
618618
ser = Series([1, 3, np.nan, np.nan, np.nan, 11])
619619
ser.interpolate(limit=2)
620620

621+
- Added ``wide_to_long`` panel data convenience function.
622+
623+
.. ipython:: python
624+
625+
import pandas as pd
626+
import numpy as np
627+
np.random.seed(123)
628+
df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
629+
"A1980" : {0 : "d", 1 : "e", 2 : "f"},
630+
"B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
631+
"B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
632+
"X" : dict(zip(range(3), np.random.randn(3)))
633+
})
634+
df["id"] = df.index
635+
df
636+
pd.wide_to_long(df, ["A", "B"], i="id", j="year")
637+
621638
.. _scipy: http://www.scipy.org
622639
.. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation
623640
.. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
@@ -894,7 +911,7 @@ to unify methods and behaviors. Series formerly subclassed directly from
894911
- added ``ftypes`` method to Series/DataFrame, similar to ``dtypes``, but indicates
895912
if the underlying is sparse/dense (as well as the dtype)
896913
- All ``NDFrame`` objects can now use ``__finalize__()`` to specify various
897-
values to propagate to new objects from an existing one (e.g. ``name`` in ``Series`` will
914+
values to propagate to new objects from an existing one (e.g. ``name`` in ``Series`` will
898915
follow more automatically now)
899916
- Internal type checking is now done via a suite of generated classes, allowing ``isinstance(value, klass)``
900917
without having to directly import the klass, courtesy of @jtratner

pandas/core/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from pandas.core.panel4d import Panel4D
1616
from pandas.core.groupby import groupby
1717
from pandas.core.reshape import (pivot_simple as pivot, get_dummies,
18-
lreshape)
18+
lreshape, wide_to_long)
1919

2020
WidePanel = Panel
2121

pandas/core/reshape.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,89 @@ def lreshape(data, groups, dropna=True, label=None):
786786

787787
return DataFrame(mdata, columns=id_cols + pivot_cols)
788788

789+
def wide_to_long(df, stubnames, i, j):
    """
    Wide panel to long format. Less flexible but more user-friendly than melt.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame
    stubnames : list
        A list of stub names. The wide format variables are assumed to
        start with the stub names. Stub names are matched literally (they
        are not interpreted as regular expressions) and the list passed in
        is left unmodified.
    i : str
        The name of the id variable.
    j : str
        The name of the subobservation variable.

    Returns
    -------
    DataFrame
        A DataFrame indexed by ``[i, j]`` that contains each stub name as a
        variable, along with every column of `df` that does not start with
        one of the stub names.

    Raises
    ------
    IndexError
        If `stubnames` is empty.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> wide_to_long(df, ["A", "B"], i="id", j="year")
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    Notes
    -----
    All extra variables are treated as extra id variables. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    The part of each column name that follows the stub becomes the value of
    `j`. For each stub, those suffixes are converted to integers when every
    one of them parses as an integer; otherwise they are kept as strings.
    """
    import re

    def get_var_names(df, regex):
        return df.filter(regex=regex).columns.tolist()

    def melt_stub(df, stub, i, j):
        # Escape the stub so characters such as "." or "(" match literally.
        varnames = get_var_names(df, "^" + re.escape(stub))
        newdf = melt(df, id_vars=i, value_vars=varnames,
                     value_name=stub, var_name=j)
        # Drop only the leading stub; a later occurrence of the stub text
        # inside the suffix (e.g. "A1A" with stub "A") must be preserved.
        newdf_j = newdf[j].str.slice(len(stub))
        try:
            newdf_j = newdf_j.astype(int)
        except ValueError:
            # Non-numeric suffixes are kept as strings.
            pass
        newdf[j] = newdf_j
        return newdf

    # Work on a copy so the caller's list is never mutated.
    stubnames = list(stubnames)

    # Every column that does not start with one of the stubs is an id column.
    not_a_stub = "^(?!%s)" % "|".join(re.escape(stub) for stub in stubnames)
    id_vars = get_var_names(df, not_a_stub)
    if i not in id_vars:
        id_vars += [i]

    newdf = melt_stub(df, stubnames[0], id_vars, j)

    for stub in stubnames[1:]:
        new = melt_stub(df, stub, id_vars, j)
        newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
    return newdf.set_index([i, j])
789872

790873
def convert_dummies(data, cat_variables, prefix_sep='_'):
791874
"""

pandas/tests/test_reshape.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
from pandas.util.testing import assert_frame_equal
1616
from numpy.testing import assert_array_equal
1717

18-
from pandas.core.reshape import melt, convert_dummies, lreshape, get_dummies
18+
from pandas.core.reshape import (melt, convert_dummies, lreshape, get_dummies,
19+
wide_to_long)
1920
import pandas.util.testing as tm
2021
from pandas.compat import StringIO, cPickle, range
2122

@@ -296,6 +297,27 @@ def test_pairs(self):
296297
'wt': ['wt%d' % i for i in range(1, 4)]}
297298
self.assertRaises(ValueError, lreshape, df, spec)
298299

300+
class TestWideToLong(tm.TestCase):
    """Checks ``wide_to_long`` against a hand-built long-format frame."""

    def test_simple(self):
        # Fixed seed so the extra "X" column is reproducible.
        np.random.seed(123)
        x = np.random.randn(3)

        # Two stubs ("A", "B") observed in two years, plus an extra column.
        wide = {"A1970": {0: "a", 1: "b", 2: "c"},
                "A1980": {0: "d", 1: "e", 2: "f"},
                "B1970": {0: 2.5, 1: 1.2, 2: .7},
                "B1980": {0: 3.2, 1: 1.3, 2: .1},
                "X": dict(zip(range(3), x))}
        df = pd.DataFrame(wide)
        df["id"] = df.index

        # The extra column repeats once per year; the stubs stack by year.
        exp_frame = DataFrame({"X": x.tolist() * 2,
                               "A": ['a', 'b', 'c', 'd', 'e', 'f'],
                               "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                               "year": [1970, 1970, 1970, 1980, 1980, 1980],
                               "id": [0, 1, 2, 0, 1, 2]})
        exp_frame = exp_frame.set_index(['id', 'year'])
        exp_frame = exp_frame[["X", "A", "B"]]

        long_frame = wide_to_long(df, ["A", "B"], i="id", j="year")
        tm.assert_frame_equal(long_frame, exp_frame)
320+
299321

300322
if __name__ == '__main__':
301323
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)