Skip to content

Commit af82dd2

Browse files
committed
BUG: 2D ndarray of dtype 'object' is always copied upon construction (pandas-dev#39263)
1 parent edbd450 commit af82dd2

File tree

4 files changed

+46
-30
lines changed

4 files changed

+46
-30
lines changed

doc/source/whatsnew/v1.3.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ Datetimelike
231231
- Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`)
232232
- Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`)
233233
- Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`)
234+
- Bug in :class:`DataFrame` constructor unnecessarily copying 2D object arrays (:issue:`39263`)
234235

235236
Timedelta
236237
^^^^^^^^^

pandas/core/internals/construction.py

Lines changed: 3 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
constructors before passing them to a BlockManager.
44
"""
55
from collections import abc
6+
from itertools import groupby
67
from typing import (
78
TYPE_CHECKING,
89
Any,
@@ -37,7 +38,6 @@
3738
is_integer_dtype,
3839
is_list_like,
3940
is_named_tuple,
40-
is_object_dtype,
4141
)
4242
from pandas.core.dtypes.generic import (
4343
ABCDataFrame,
@@ -59,7 +59,7 @@
5959
)
6060
from pandas.core.internals.managers import (
6161
create_block_manager_from_arrays,
62-
create_block_manager_from_blocks,
62+
create_block_manager_from_array
6363
)
6464

6565
if TYPE_CHECKING:
@@ -232,34 +232,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
232232
)
233233
values = values.T
234234

235-
# if we don't have a dtype specified, then try to convert objects
236-
# on the entire block; this is to convert if we have datetimelike's
237-
# embedded in an object type
238-
if dtype is None and is_object_dtype(values.dtype):
239-
240-
if values.ndim == 2 and values.shape[0] != 1:
241-
# transpose and separate blocks
242-
243-
dvals_list = [maybe_infer_to_datetimelike(row) for row in values]
244-
for n in range(len(dvals_list)):
245-
if isinstance(dvals_list[n], np.ndarray):
246-
dvals_list[n] = dvals_list[n].reshape(1, -1)
247-
248-
from pandas.core.internals.blocks import make_block
249-
250-
# TODO: What about re-joining object columns?
251-
block_values = [
252-
make_block(dvals_list[n], placement=[n], ndim=2)
253-
for n in range(len(dvals_list))
254-
]
255-
256-
else:
257-
datelike_vals = maybe_infer_to_datetimelike(values)
258-
block_values = [datelike_vals]
259-
else:
260-
block_values = [values]
261-
262-
return create_block_manager_from_blocks(block_values, [columns, index])
235+
return create_block_manager_from_array(values, [columns, index], dtype)
263236

264237

265238
def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):

pandas/core/internals/managers.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from collections import defaultdict
44
import itertools
5+
from functools import reduce
56
from typing import (
67
Any,
78
Callable,
@@ -28,6 +29,7 @@
2829
find_common_type,
2930
infer_dtype_from_scalar,
3031
maybe_promote,
32+
maybe_infer_to_datetimelike
3133
)
3234
from pandas.core.dtypes.common import (
3335
DT64NS_DTYPE,
@@ -1679,6 +1681,30 @@ def create_block_manager_from_arrays(
16791681
raise construction_error(len(arrays), arrays[0].shape, axes, e)
16801682

16811683

1684+
def create_block_manager_from_array(
    array, axes: List[Index], dtype: Optional[Dtype] = None
) -> BlockManager:
    """
    Build a BlockManager from a single array.

    When no ``dtype`` is given, every item obtained by iterating ``array``
    is passed through ``maybe_infer_to_datetimelike``. If none of the items
    changes dtype, ``array`` itself is handed to ``make_block`` as one block;
    otherwise the inferred items are split into blocks by ``_form_blocks``.
    When ``dtype`` is given, inference is skipped and ``array`` is wrapped in
    one block with that dtype.

    Parameters
    ----------
    array : array-like
        Values to wrap. An ``ABCPandasArray`` is converted with
        ``to_numpy()`` first.
    axes : list of Index
        Axes of the resulting manager; ``len(axes[0])`` sets the placement
        of the single block.
    dtype : dtype, optional
        Forwarded to ``make_block``.

    Returns
    -------
    BlockManager
        Consolidated in place before being returned.

    Raises
    ------
    ValueError
        Any ValueError raised while building the blocks or the manager is
        routed through ``construction_error``.
    """
    assert isinstance(axes, list)
    assert all(isinstance(x, Index) for x in axes)

    # ensure we don't have any PandasArrays when we call get_block_type
    # Note: just calling extract_array breaks tests that patch PandasArray._typ.
    if isinstance(array, ABCPandasArray):
        array = array.to_numpy()

    # Inference is only meaningful without an explicit dtype; skipping it
    # otherwise avoids a full pass over the data whose result was discarded.
    inferred = None
    if dtype is None:
        candidates = [maybe_infer_to_datetimelike(item) for item in array]
        if not all(is_dtype_equal(item.dtype, array.dtype) for item in candidates):
            inferred = candidates

    try:
        if inferred is None:
            # Nothing was converted (or dtype was forced): keep the original
            # array as one block instead of rebuilding it item by item.
            blocks = [make_block(array, slice(0, len(axes[0])), dtype=dtype)]
        else:
            blocks = _form_blocks(inferred, axes[0], axes)
        mgr = BlockManager(blocks, axes)
        mgr._consolidate_inplace()
        return mgr
    except ValueError as e:
        raise construction_error(array.shape[0], array.shape[1:], axes, e)
1706+
1707+
16821708
def construction_error(tot_items, block_shape, axes, e=None):
16831709
""" raise a helpful message about our construction """
16841710
passed = tuple(map(int, [tot_items] + list(block_shape)))
@@ -1706,6 +1732,11 @@ def construction_error(tot_items, block_shape, axes, e=None):
17061732
def _form_blocks(arrays, names: Index, axes) -> List[Block]:
17071733
# put "leftover" items in float bucket, where else?
17081734
# generalize?
1735+
1736+
if len(arrays) != len(names):
1737+
raise ValueError(f"Number of arrays ({len(arrays)}) "
1738+
f"does not match index length ({len(names)})")
1739+
17091740
items_dict: DefaultDict[str, List] = defaultdict(list)
17101741
extra_locs = []
17111742

@@ -1908,6 +1939,8 @@ def _merge_blocks(
19081939

19091940
# TODO: optimization potential in case all mgrs contain slices and
19101941
# combination of those slices is a slice, too.
1942+
for b in blocks:
1943+
print(b.values.base)
19111944
new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
19121945
new_values = np.vstack([b.values for b in blocks])
19131946

pandas/tests/frame/test_constructors.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2267,6 +2267,14 @@ def test_nested_dict_construction(self):
22672267
)
22682268
tm.assert_frame_equal(result, expected)
22692269

2270+
def test_object_array_does_not_copy(self):
    """GH#39263: constructing from an object ndarray must not copy it."""
    one_dim = np.array(["a", "b"], dtype="object")
    two_dim = np.array([["a", "b"], ["c", "d"]], dtype="object")

    # Check the 1D input first, then the 2D one.
    for source in (one_dim, two_dim):
        frame = DataFrame(source)
        assert np.shares_memory(frame.values, source)
2277+
22702278
def test_from_tzaware_object_array(self):
22712279
# GH#26825 2D object array of tzaware timestamps should not raise
22722280
dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
@@ -2377,6 +2385,7 @@ def test_from_timestamp_scalar_preserves_nanos(self, constructor):
23772385
def test_from_timedelta64_scalar_object(self, constructor):
23782386

23792387
td = Timedelta(1)
2388+
constructor(td, dtype=object)
23802389
td64 = td.to_timedelta64()
23812390

23822391
obj = constructor(td64, dtype=object)

0 commit comments

Comments
 (0)