Skip to content

BUG/REF: unstack with EA dtypes #33356

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 10, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,16 @@ def _is_homogeneous_type(self) -> bool:
else:
return not self._mgr.is_mixed_type

@property
def _can_fast_transpose(self) -> bool:
    """
    Whether this DataFrame can be transposed without creating new array objects.

    Returns
    -------
    bool
        False if the block manager reports any extension types; otherwise
        True only when the manager holds exactly one block.
    """
    # Use ``_mgr`` like the neighbouring ``_is_homogeneous_type`` property.
    mgr = self._mgr
    if mgr.any_extension_types:
        # TODO(EA2D) special case would be unnecessary with 2D EAs
        return False
    return len(mgr.blocks) == 1

# ----------------------------------------------------------------------
# Rendering Methods

Expand Down
40 changes: 9 additions & 31 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import pandas.core.algorithms as algos
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import extract_array
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import Index, MultiIndex
from pandas.core.series import Series
Expand Down Expand Up @@ -413,7 +412,7 @@ def unstack(obj, level, fill_value=None):
level = obj.index._get_level_number(level)

if isinstance(obj, DataFrame):
if isinstance(obj.index, MultiIndex):
if isinstance(obj.index, MultiIndex) or not obj._can_fast_transpose:
return _unstack_frame(obj, level, fill_value=fill_value)
else:
return obj.T.stack(dropna=False)
Expand All @@ -429,14 +428,14 @@ def unstack(obj, level, fill_value=None):


def _unstack_frame(obj, level, fill_value=None):
    """
    Unstack ``level`` of ``obj``'s index, returning a new frame.

    Parameters
    ----------
    obj : DataFrame
    level : int
        Index level to unstack.
    fill_value : scalar, optional
        Passed through to the unstacking routines.
    """
    if not obj._can_fast_transpose:
        # Let the block manager unstack its blocks with a shared unstacker.
        unstacker = _Unstacker(obj.index, level=level)
        mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
        return obj._constructor(mgr)
    else:
        return _Unstacker(
            obj.index, level=level, constructor=obj._constructor,
        ).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value)


def _unstack_extension_series(series, level, fill_value):
Expand All @@ -462,31 +461,10 @@ def _unstack_extension_series(series, level, fill_value):
Each column of the DataFrame will have the same dtype as
the input Series.
"""
# Implementation note: the basic idea is to
# 1. Do a regular unstack on a dummy array of integers
# 2. Followup with a columnwise take.
# We use the dummy take to discover newly-created missing values
# introduced by the reshape.
from pandas.core.reshape.concat import concat

dummy_arr = np.arange(len(series))
# fill_value=-1, since we will do a series.values.take later
result = _Unstacker(series.index, level=level).get_result(
dummy_arr, value_columns=None, fill_value=-1
)

out = []
values = extract_array(series, extract_numpy=False)

for col, indices in result.items():
out.append(
Series(
values.take(indices.values, allow_fill=True, fill_value=fill_value),
name=col,
index=result.index,
)
)
return concat(out, axis="columns", copy=False, keys=result.columns)
# Defer to the logic in ExtensionBlock._unstack
df = series.to_frame()
result = df.unstack(level=level, fill_value=fill_value)
return result.droplevel(level=0, axis=1)


def stack(frame, level=-1, dropna=True):
Expand Down
14 changes: 13 additions & 1 deletion pandas/tests/extension/base/casting.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,22 @@ class BaseCastingTests(BaseExtensionTests):
"""Casting to and from ExtensionDtypes"""

def test_astype_object_series(self, all_data):
ser = pd.Series({"A": all_data})
ser = pd.Series(all_data, name="A")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, that was a bug.

result = ser.astype(object)
assert isinstance(result._mgr.blocks[0], ObjectBlock)

def test_astype_object_frame(self, all_data):
    # Casting a frame built from ``all_data`` to object should yield an
    # ObjectBlock as its first block.
    df = pd.DataFrame({"A": all_data})

    result = df.astype(object)
    # Use ``_mgr`` for consistency with test_astype_object_series.
    blk = result._mgr.blocks[0]
    assert isinstance(blk, ObjectBlock), type(blk)

    # FIXME: these currently fail; don't leave commented-out
    # check that we can compare the dtypes
    # cmp = result.dtypes.equals(df.dtypes)
    # assert not cmp.any()

def test_tolist(self, data):
result = pd.Series(data).tolist()
expected = list(data)
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/extension/base/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.core.internals import ExtensionBlock

from .base import BaseExtensionTests
Expand Down Expand Up @@ -295,6 +296,14 @@ def test_unstack(self, data, index, obj):
assert all(
isinstance(result[col].array, type(data)) for col in result.columns
)

if obj == "series":
# We should get the same result with to_frame+unstack+droplevel
df = ser.to_frame()

alt = df.unstack(level=level).droplevel(0, axis=1)
tm.assert_frame_equal(result, alt)

expected = ser.astype(object).unstack(level=level)
result = result.astype(object)

Expand Down
22 changes: 21 additions & 1 deletion pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from pandas.errors import PerformanceWarning

from pandas.core.dtypes.common import is_object_dtype

import pandas as pd
from pandas import SparseDtype
import pandas._testing as tm
Expand Down Expand Up @@ -309,7 +311,25 @@ def test_searchsorted(self, data_for_sorting, as_series):


class TestCasting(BaseSparseTests, base.BaseCastingTests):
    """Casting tests for sparse data; overrides the base block-type checks."""

    def test_astype_object_series(self, all_data):
        # Unlike the base class, we do not expect the resulting Block
        # to be ObjectBlock
        ser = pd.Series(all_data, name="A")
        result = ser.astype(object)
        assert is_object_dtype(result._mgr.blocks[0].dtype)

    def test_astype_object_frame(self, all_data):
        # Unlike the base class, we do not expect the resulting Block
        # to be ObjectBlock
        df = pd.DataFrame({"A": all_data})

        result = df.astype(object)
        assert is_object_dtype(result._mgr.blocks[0].dtype)

        # FIXME: these currently fail; don't leave commented-out
        # check that we can compare the dtypes
        # comp = result.dtypes.equals(df.dtypes)
        # assert not comp.any()


class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests):
Expand Down