Skip to content

Commit 18ee145

Browse files
committed
ENH: GH9746 DataFrame.unstack and Series.unstack now take fill_value kw for filling NaN when unstack results in a sparse DataFrame
1 parent 4fde946 commit 18ee145

File tree

6 files changed

+65
-11
lines changed

6 files changed

+65
-11
lines changed

doc/source/reshaping.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,22 @@ which level in the columns to stack:
228228
df2.stack('exp')
229229
df2.stack('animal')
230230
231+
Unstacking can result in missing values if subgroups do not have the same
232+
set of labels. By default, missing values will be replaced with NaN.
233+
234+
.. ipython:: python
235+
236+
df3 = df.ix[[0, 1, 4, 7], [1, 2]]
237+
df3
238+
df3.unstack()
239+
240+
Alternatively, ``unstack`` takes an optional ``fill_value`` argument that
241+
specifies the value to use in place of missing data.
242+
243+
.. ipython:: python
244+
245+
df3.unstack(fill_value=-1e9)
246+
231247
With a MultiIndex
232248
~~~~~~~~~~~~~~~~~
233249

doc/source/whatsnew/v0.17.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ Other API Changes
3939
^^^^^^^^^^^^^^^^^
4040

4141
- ``Holiday`` now raises ``NotImplementedError`` if both ``offset`` and ``observance`` are used in constructor. (:issue:`102171`)
42+
- ``DataFrame.unstack`` and ``Series.unstack`` now take a ``fill_value`` keyword to directly replace the missing values that unstacking can introduce in the resulting ``DataFrame``. As an added benefit, specifying ``fill_value`` preserves the data type of the original stacked data. (:issue:`9746`)
4243

4344
.. _whatsnew_0170.deprecations:
4445

pandas/core/frame.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3545,7 +3545,7 @@ def stack(self, level=-1, dropna=True):
35453545
else:
35463546
return stack(self, level, dropna=dropna)
35473547

3548-
def unstack(self, level=-1):
3548+
def unstack(self, level=-1, fill_value=None):
35493549
"""
35503550
Pivot a level of the (necessarily hierarchical) index labels, returning
35513551
a DataFrame having a new level of column labels whose inner-most level
@@ -3558,6 +3558,8 @@ def unstack(self, level=-1):
35583558
----------
35593559
level : int, string, or list of these, default -1 (last level)
35603560
Level(s) of index to unstack, can pass level name
3561+
fill_value : replace NaN with this value if the unstack produces
3562+
missing values
35613563
35623564
See also
35633565
--------
@@ -3599,7 +3601,7 @@ def unstack(self, level=-1):
35993601
unstacked : DataFrame or Series
36003602
"""
36013603
from pandas.core.reshape import unstack
3602-
return unstack(self, level)
3604+
return unstack(self, level, fill_value)
36033605

36043606
#----------------------------------------------------------------------
36053607
# Time series-related

pandas/core/reshape.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class _Unstacker(object):
6161
unstacked : DataFrame
6262
"""
6363

64-
def __init__(self, values, index, level=-1, value_columns=None):
64+
def __init__(self, values, index, level=-1, value_columns=None, fill_value=None):
6565

6666
self.is_categorical = None
6767
if values.ndim == 1:
@@ -71,6 +71,7 @@ def __init__(self, values, index, level=-1, value_columns=None):
7171
values = values[:, np.newaxis]
7272
self.values = values
7373
self.value_columns = value_columns
74+
self.fill_value = fill_value
7475

7576
if value_columns is None and values.shape[1] != 1: # pragma: no cover
7677
raise ValueError('must pass column labels for multi-column data')
@@ -179,6 +180,10 @@ def get_new_values(self):
179180
if self.mask.all():
180181
dtype = values.dtype
181182
new_values = np.empty(result_shape, dtype=dtype)
183+
elif self.fill_value is not None:
184+
dtype = values.dtype
185+
new_values = np.empty(result_shape, dtype=dtype)
186+
new_values.fill(self.fill_value)
182187
else:
183188
dtype, fill_value = _maybe_promote(values.dtype)
184189
new_values = np.empty(result_shape, dtype=dtype)
@@ -389,21 +394,21 @@ def _slow_pivot(index, columns, values):
389394
return DataFrame(tree)
390395

391396

392-
def unstack(obj, level):
397+
def unstack(obj, level, fill_value=None):
393398
if isinstance(level, (tuple, list)):
394399
return _unstack_multiple(obj, level)
395400

396401
if isinstance(obj, DataFrame):
397402
if isinstance(obj.index, MultiIndex):
398-
return _unstack_frame(obj, level)
403+
return _unstack_frame(obj, level, fill_value=fill_value)
399404
else:
400405
return obj.T.stack(dropna=False)
401406
else:
402-
unstacker = _Unstacker(obj.values, obj.index, level=level)
407+
unstacker = _Unstacker(obj.values, obj.index, level=level, fill_value=fill_value)
403408
return unstacker.get_result()
404409

405410

406-
def _unstack_frame(obj, level):
411+
def _unstack_frame(obj, level, fill_value=None):
407412
from pandas.core.internals import BlockManager, make_block
408413

409414
if obj._is_mixed_type:
@@ -419,7 +424,7 @@ def _unstack_frame(obj, level):
419424
for blk in obj._data.blocks:
420425
blk_items = obj._data.items[blk.mgr_locs.indexer]
421426
bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
422-
value_columns=blk_items)
427+
value_columns=blk_items, fill_value=fill_value)
423428
new_items = bunstacker.get_new_columns()
424429
new_placement = new_columns.get_indexer(new_items)
425430
new_values, mask = bunstacker.get_new_values()
@@ -435,7 +440,7 @@ def _unstack_frame(obj, level):
435440
return result.ix[:, mask_frame.sum(0) > 0]
436441
else:
437442
unstacker = _Unstacker(obj.values, obj.index, level=level,
438-
value_columns=obj.columns)
443+
value_columns=obj.columns, fill_value=fill_value)
439444
return unstacker.get_result()
440445

441446

pandas/core/series.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1914,7 +1914,7 @@ def reorder_levels(self, order):
19141914
result.index = result.index.reorder_levels(order)
19151915
return result
19161916

1917-
def unstack(self, level=-1):
1917+
def unstack(self, level=-1, fill_value=None):
19181918
"""
19191919
Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
19201920
The level involved will automatically get sorted.
@@ -1923,6 +1923,8 @@ def unstack(self, level=-1):
19231923
----------
19241924
level : int, string, or list of these, default last level
19251925
Level(s) to unstack, can pass level name
1926+
fill_value : replace NaN with this value if the unstack produces
1927+
missing values
19261928
19271929
Examples
19281930
--------
@@ -1947,7 +1949,7 @@ def unstack(self, level=-1):
19471949
unstacked : DataFrame
19481950
"""
19491951
from pandas.core.reshape import unstack
1950-
return unstack(self, level)
1952+
return unstack(self, level, fill_value)
19511953

19521954
#----------------------------------------------------------------------
19531955
# function application

pandas/tests/test_frame.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12485,6 +12485,34 @@ def test_stack_unstack(self):
1248512485
assert_frame_equal(unstacked_cols.T, self.frame)
1248612486
assert_frame_equal(unstacked_cols_df['bar'].T, self.frame)
1248712487

12488+
def test_unstack_fill(self):
12489+
12490+
# GH #9746: fill_value keyword argument for Series
12491+
# and DataFrame unstack
12492+
12493+
# From a series
12494+
data = Series([1, 2, 4, 5], dtype=np.int16)
12495+
data.index = MultiIndex.from_tuples(
12496+
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
12497+
12498+
result = data.unstack(fill_value=-1)
12499+
expected = DataFrame({'a':[1, -1, 5], 'b':[2, 4, -1]}, index=['x', 'y', 'z'], dtype=np.int16)
12500+
assert_frame_equal(result, expected)
12501+
12502+
# From a dataframe
12503+
rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
12504+
df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
12505+
df.index = MultiIndex.from_tuples(
12506+
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
12507+
12508+
result = df.unstack(fill_value=-1)
12509+
12510+
rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
12511+
expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
12512+
expected.columns = MultiIndex.from_tuples(
12513+
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
12514+
assert_frame_equal(result, expected)
12515+
1248812516
def test_stack_ints(self):
1248912517
df = DataFrame(
1249012518
np.random.randn(30, 27),

0 commit comments

Comments
 (0)