From ff2f2719b2a8766eb87ed93218c42345ed9850a3 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Fri, 22 Jan 2021 01:58:33 +0000 Subject: [PATCH 01/16] BUG: 2D ndarray of dtype 'object' is always copied upon construction (#39263) --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/internals/__init__.py | 2 -- pandas/core/internals/construction.py | 41 ++++++++++--------------- pandas/core/internals/managers.py | 39 ++++++++--------------- pandas/tests/frame/test_constructors.py | 8 +++++ 5 files changed, 39 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7fe0b53d7d2ff..32fede1bf895c 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -280,6 +280,7 @@ Datetimelike - Bug in comparisons between :class:`Timestamp` object and ``datetime64`` objects just outside the implementation bounds for nanosecond ``datetime64`` (:issue:`39221`) - Bug in :meth:`Timestamp.round`, :meth:`Timestamp.floor`, :meth:`Timestamp.ceil` for values near the implementation bounds of :class:`Timestamp` (:issue:`39244`) - Bug in :func:`date_range` incorrectly creating :class:`DatetimeIndex` containing ``NaT`` instead of raising ``OutOfBoundsDatetime`` in corner cases (:issue:`24124`) +- Bug in :class:`DataFrame` constructor always copying 2D object arrays (:issue:`39272`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index ff4e186e147d7..ed91d696437d4 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -18,7 +18,6 @@ BlockManager, SingleBlockManager, create_block_manager_from_arrays, - create_block_manager_from_blocks, ) __all__ = [ @@ -40,5 +39,4 @@ "concatenate_block_managers", # those two are preserved here for downstream compatibility (GH-33892) "create_block_manager_from_arrays", - "create_block_manager_from_blocks", ] diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f864f1cddfe7a..9b7d7047d7a34 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -60,8 +60,8 @@ union_indexes, ) from pandas.core.internals.managers import ( + create_block_manager_from_array, create_block_manager_from_arrays, - create_block_manager_from_blocks, ) if TYPE_CHECKING: @@ -230,36 +230,29 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): index, columns = _get_axes( values.shape[0], values.shape[1], index=index, columns=columns ) - values = values.T + + array = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(values.dtype): - - if values.ndim == 2 and values.shape[0] != 1: - # transpose and separate blocks - - dvals_list = [maybe_infer_to_datetimelike(row) for row in values] - for n in range(len(dvals_list)): - if isinstance(dvals_list[n], np.ndarray): - dvals_list[n] = dvals_list[n].reshape(1, -1) - - from pandas.core.internals.blocks import make_block - - # TODO: What about re-joining object columns? - block_values = [ - make_block(dvals_list[n], placement=[n], ndim=2) - for n in range(len(dvals_list)) + if dtype is None and is_object_dtype(array.dtype): + if array.ndim == 2 and array.shape[0] != 1: + maybe_datetime = [ + maybe_infer_to_datetimelike(instance) for instance in array ] - + # don't convert (and copy) the objects if no type inference occurs + if any( + not is_dtype_equal(instance.dtype, array.dtype) + for instance in maybe_datetime + ): + return create_block_manager_from_arrays( + maybe_datetime, columns, [columns, index] + ) else: - datelike_vals = maybe_infer_to_datetimelike(values) - block_values = [datelike_vals] - else: - block_values = [values] + array = maybe_infer_to_datetimelike(array) - return create_block_manager_from_blocks(block_values, [columns, index]) + return create_block_manager_from_array(array, [columns, index]) def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5ed2ca469dd8a..7807f6592b43b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1633,32 +1633,6 @@ def fast_xs(self, loc): # Constructor Helpers -def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: - try: - if len(blocks) == 1 and not isinstance(blocks[0], Block): - # if blocks[0] is of length 0, return empty blocks - if not len(blocks[0]): - blocks = [] - else: - # It's OK if a single block is passed as values, its placement - # is basically "all items", but if there're many, don't bother - # converting, it's an error anyway. - blocks = [ - make_block( - values=blocks[0], placement=slice(0, len(axes[0])), ndim=2 - ) - ] - - mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr - - except ValueError as e: - blocks = [getattr(b, "values", b) for b in blocks] - tot_items = sum(b.shape[0] for b in blocks) - raise construction_error(tot_items, blocks[0].shape[1:], axes, e) - - def create_block_manager_from_arrays( arrays, names: Index, axes: List[Index] ) -> BlockManager: @@ -1678,6 +1652,19 @@ def create_block_manager_from_arrays( raise construction_error(len(arrays), arrays[0].shape, axes, e) +def create_block_manager_from_array( + array, + axes: List[Index], +) -> BlockManager: + try: + block = make_block(values=array, placement=slice(0, len(axes[0])), ndim=2) + mgr = BlockManager([block], axes) + mgr._consolidate_inplace() + except ValueError as e: + raise construction_error(array.shape[0], array.shape[1:], axes, e) + return mgr + + def construction_error(tot_items, block_shape, axes, e=None): """ raise a helpful message about our construction """ passed = tuple(map(int, [tot_items] + list(block_shape))) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f23b8d559a00a..8601fd25edcfd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1909,6 +1909,14 @@ def test_constructor_series_copy(self, float_frame): assert not (series["A"] == 5).all() + def test_object_array_does_not_copy(self): + a = np.array(["a", "b"], dtype="object") + b = np.array([["a", "b"], ["c", "d"]], dtype="object") + df = DataFrame(a) + assert np.shares_memory(df.values, a) + df2 = DataFrame(b) + assert np.shares_memory(df2.values, b) + def test_constructor_with_nas(self): # GH 5016 # na's in indices From 570dbe11caa92f4fa5cfbb09c6ae5fb0741286a0 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Tue, 25 May 2021 19:18:00 +0100 Subject: [PATCH 02/16] lint --- pandas/core/internals/construction.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 87c1dd68e576b..ed478945045fd 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -79,13 +79,10 @@ ArrayManager, SingleArrayManager, ) -from pandas.core.internals.blocks import ( - ensure_block_shape, -) from pandas.core.internals.managers import ( - create_block_manager_from_array, BlockManager, SingleBlockManager, + create_block_manager_from_array, create_block_manager_from_arrays, ) @@ -362,8 +359,8 @@ def ndarray_to_mgr( ] # don't convert (and copy) the objects if no type inference occurs if any( - not is_dtype_equal(instance.dtype, array.dtype) - for instance in maybe_datetime + not is_dtype_equal(instance.dtype, array.dtype) + for instance in maybe_datetime ): return create_block_manager_from_arrays( maybe_datetime, columns, [columns, index] From cb59f2a67b9eddff97104f599c682da45822e068 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Tue, 25 May 2021 20:05:29 +0100 Subject: [PATCH 03/16] add back _from_blocks --- pandas/core/internals/__init__.py | 2 ++ pandas/core/internals/managers.py | 16 ++++++++++++++++ pandas/tests/internals/test_api.py | 2 +- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 5c10a3d459cd8..af1350f088b7a 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -19,6 +19,7 @@ BlockManager, SingleBlockManager, create_block_manager_from_arrays, + create_block_manager_from_blocks, ) __all__ = [ @@ -38,6 +39,7 @@ "concatenate_managers", # those two are preserved here for downstream compatibility (GH-33892) "create_block_manager_from_arrays", + "create_block_manager_from_blocks", ] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 051394d6ddab1..7214def333e85 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1710,6 +1710,22 @@ def _equal_values(self: T, other: T) -> bool: # Constructor Helpers +def create_block_manager_from_blocks( + blocks: list[Block], axes: list[Index], consolidate: bool = True +) -> BlockManager: + try: + mgr = BlockManager(blocks, axes) + + except ValueError as err: + arrays = [blk.values for blk in blocks] + tot_items = sum(arr.shape[0] for arr in arrays) + raise construction_error(tot_items, arrays[0].shape[1:], axes, err) + + if consolidate: + mgr._consolidate_inplace() + return mgr + + # We define this here so we can override it in tests.extension.test_numpy def _extract_array(obj): return extract_array(obj, extract_numpy=True) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index fdd9588d747db..21299d76eaf5a 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -40,7 +40,7 @@ def test_namespace(): "SingleArrayManager", "concatenate_managers", "create_block_manager_from_arrays", - "create_block_manager_from_array", + "create_block_manager_from_blocks", ] result = [x for x in dir(internals) if not x.startswith("__")] From 672f9c0be36cd6875e0570895b0d1b9230b4dfae Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Wed, 26 May 2021 18:40:10 +0100 Subject: [PATCH 04/16] try not replacing _from_blocks with _from_array --- pandas/core/internals/construction.py | 45 ++++++++++++++++++--------- pandas/core/internals/managers.py | 20 ------------ 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 8d73d8910f5d3..95997c2a61f89 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -80,11 +80,15 @@ ArrayManager, SingleArrayManager, ) +from pandas.core.internals.blocks import ( + ensure_block_shape, + new_block, +) from pandas.core.internals.managers import ( BlockManager, SingleBlockManager, - create_block_manager_from_array, create_block_manager_from_arrays, + create_block_manager_from_blocks, ) if TYPE_CHECKING: @@ -347,29 +351,42 @@ def ndarray_to_mgr( return ArrayManager(arrays, [index, columns], verify_integrity=False) - array = values.T + values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(array.dtype): - if array.ndim == 2 and array.shape[0] != 1: + if dtype is None and is_object_dtype(values.dtype): + + if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks - maybe_datetime = [ - maybe_infer_to_datetimelike(instance) for instance in array - ] + + dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] # don't convert (and copy) the objects if no type inference occurs if any( - not is_dtype_equal(instance.dtype, array.dtype) - for instance in maybe_datetime + not is_dtype_equal(instance.dtype, values.dtype) + for instance in dtlike_vals ): - return create_block_manager_from_arrays( - maybe_datetime, columns, [columns, index] - ) + dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] + block_values = [ + new_block(dvals_list[n], placement=n, ndim=2) + for n in range(len(dvals_list)) + ] + else: + nb = new_block(values, placement=slice(len(columns)), ndim=2) + block_values = [nb] else: - array = maybe_infer_to_datetimelike(array) + datelike_vals = maybe_infer_to_datetimelike(values) + nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) + block_values = [nb] + else: + nb = new_block(values, placement=slice(len(columns)), ndim=2) + block_values = [nb] + + if len(columns) == 0: + block_values = [] - return create_block_manager_from_array(array, [columns, index]) + return create_block_manager_from_blocks(block_values, [columns, index]) def _check_values_indices_shape_match( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7214def333e85..323aa45874d96 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1753,26 +1753,6 @@ def create_block_manager_from_arrays( return mgr -def create_block_manager_from_array( - array, - axes: list[Index], - consolidate: bool = True, -) -> BlockManager: - assert isinstance(axes, list) - assert all(isinstance(x, Index) for x in axes) - - array = _extract_array(array) - - try: - block = new_block(values=array, placement=slice(0, len(axes[0])), ndim=2) - mgr = BlockManager([block], axes) - except ValueError as e: - raise construction_error(array.shape[0], array.shape[1:], axes, e) - if consolidate: - mgr._consolidate_inplace() - return mgr - - def construction_error( tot_items: int, block_shape: Shape, From b6b248abd3a590311ff133d874be7d92accfbdf0 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Mon, 31 May 2021 17:26:54 +0100 Subject: [PATCH 05/16] reduce test to only 2d arr --- pandas/tests/frame/test_constructors.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 686c59873aa42..631376e12e9e6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2064,13 +2064,10 @@ def test_constructor_series_copy(self, float_frame): assert not (series["A"] == 5).all() - def test_object_array_does_not_copy(self): - a = np.array(["a", "b"], dtype="object") - b = np.array([["a", "b"], ["c", "d"]], dtype="object") - df = DataFrame(a) - assert np.shares_memory(df.values, a) - df2 = DataFrame(b) - assert np.shares_memory(df2.values, b) + def test_2d_object_array_does_not_copy(self): + arr = np.array([["a", "b"], ["c", "d"]], dtype="object") + df = DataFrame(arr) + assert np.shares_memory(df.values, arr) def test_constructor_with_nas(self): # GH 5016 From 59597f38741d85649f802bb6a9bd34cd506cb6a6 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Wed, 7 Jul 2021 18:58:11 +0100 Subject: [PATCH 06/16] restore broader test --- pandas/tests/frame/test_constructors.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ece943d73ef48..a863a62c8b623 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2080,10 +2080,13 @@ def test_constructor_series_copy(self, float_frame): assert not (series["A"] == 5).all() - def test_2d_object_array_does_not_copy(self): - arr = np.array([["a", "b"], ["c", "d"]], dtype="object") - df = DataFrame(arr) - assert np.shares_memory(df.values, arr) + def test_object_array_does_not_copy(self): + a = np.array(["a", "b"], dtype="object") + b = np.array([["a", "b"], ["c", "d"]], dtype="object") + df = DataFrame(a) + assert np.shares_memory(df.values, a) + df2 = DataFrame(b) + assert np.shares_memory(df2.values, b) def test_constructor_with_nas(self): # GH 5016 From f632526c1d447d84c01c0aed81f42524fd37019c Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Wed, 7 Jul 2021 18:59:07 +0100 Subject: [PATCH 07/16] move whatsnew to v1.3.1 --- doc/source/whatsnew/v1.3.1.rst | 2 +- doc/source/whatsnew/v1.4.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 9c17a22bf6d52..2bf9024e9676a 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -24,7 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :class:`DataFrame` constructor always copying 2D object arrays (:issue:`39272`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 632192adf8c20..5b09b62fa9e88 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -176,7 +176,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :func:`to_datetime` returning pd.NaT for inputs that produce duplicated values, when ``cache=True`` (:issue:`42259`) -- Bug in :class:`DataFrame` constructor always copying 2D object arrays (:issue:`39272`) +- Timedelta ^^^^^^^^^ From ac71b7d0b1cb9b563cc215ee231e296ab61a3142 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Sat, 10 Jul 2021 02:54:20 +0100 Subject: [PATCH 08/16] split test --- pandas/tests/frame/test_constructors.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a863a62c8b623..a5568dda56c67 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -253,6 +253,18 @@ def test_constructor_dtype_nocast_view_2d_array(self): should_be_view[0][0] = 97 assert df.values[0, 0] == 97 + @td.skip_array_manager_invalid_test + def test_1d_object_array_does_not_copy(self): + arr = np.array(["a", "b"], dtype="object") + df = DataFrame(arr) + assert np.shares_memory(df.values, arr) + + @td.skip_array_manager_invalid_test + def test_2d_object_array_does_not_copy(self): + arr = np.array([["a", "b"], ["c", "d"]], dtype="object") + df = DataFrame(arr) + assert np.shares_memory(df.values, arr) + def test_constructor_dtype_list_data(self): df = DataFrame([[1, "2"], [None, "a"]], dtype=object) assert df.loc[1, 0] is None @@ -2080,14 +2092,6 @@ def test_constructor_series_copy(self, float_frame): assert not (series["A"] == 5).all() - def test_object_array_does_not_copy(self): - a = np.array(["a", "b"], dtype="object") - b = np.array([["a", "b"], ["c", "d"]], dtype="object") - df = DataFrame(a) - assert np.shares_memory(df.values, a) - df2 = DataFrame(b) - assert np.shares_memory(df2.values, b) - def test_constructor_with_nas(self): # GH 5016 # na's in indices From 9e00908665b8903598f836dfb8d326e6bfa8f38e Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Sun, 11 Jul 2021 19:20:04 +0100 Subject: [PATCH 09/16] Revert "try not replacing _from_blocks with _from_array" This reverts commit 672f9c0be36cd6875e0570895b0d1b9230b4dfae. --- pandas/core/internals/construction.py | 45 +++++++++------------------ pandas/core/internals/managers.py | 20 ++++++++++++ 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 90fbea7804fb8..8cfd6e709ea4b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -76,15 +76,11 @@ ArrayManager, SingleArrayManager, ) -from pandas.core.internals.blocks import ( - ensure_block_shape, - new_block, -) from pandas.core.internals.managers import ( BlockManager, SingleBlockManager, + create_block_manager_from_array, create_block_manager_from_arrays, - create_block_manager_from_blocks, ) if TYPE_CHECKING: @@ -340,42 +336,29 @@ def ndarray_to_mgr( return ArrayManager(arrays, [index, columns], verify_integrity=False) - values = values.T + array = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(values.dtype): - - if values.ndim == 2 and values.shape[0] != 1: + if dtype is None and is_object_dtype(array.dtype): + if array.ndim == 2 and array.shape[0] != 1: # transpose and separate blocks - - dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] + maybe_datetime = [ + maybe_infer_to_datetimelike(instance) for instance in array + ] # don't convert (and copy) the objects if no type inference occurs if any( - not is_dtype_equal(instance.dtype, values.dtype) - for instance in dtlike_vals + not is_dtype_equal(instance.dtype, array.dtype) + for instance in maybe_datetime ): - dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] - block_values = [ - new_block(dvals_list[n], placement=n, ndim=2) - for n in range(len(dvals_list)) - ] - else: - nb = new_block(values, placement=slice(len(columns)), ndim=2) - block_values = [nb] + return create_block_manager_from_arrays( + maybe_datetime, columns, [columns, index] + ) else: - datelike_vals = maybe_infer_to_datetimelike(values) - nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) - block_values = [nb] - else: - nb = new_block(values, placement=slice(len(columns)), ndim=2) - block_values = [nb] - - if len(columns) == 0: - block_values = [] + array = maybe_infer_to_datetimelike(array) - return create_block_manager_from_blocks(block_values, [columns, index]) + return create_block_manager_from_array(array, [columns, index]) def _check_values_indices_shape_match( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index dca6ddf703446..9874663767cfa 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1791,6 +1791,26 @@ def create_block_manager_from_arrays( return mgr +def create_block_manager_from_array( + array, + axes: list[Index], + consolidate: bool = True, +) -> BlockManager: + assert isinstance(axes, list) + assert all(isinstance(x, Index) for x in axes) + + array = _extract_array(array) + + try: + block = new_block(values=array, placement=slice(0, len(axes[0])), ndim=2) + mgr = BlockManager([block], axes) + except ValueError as e: + raise construction_error(array.shape[0], array.shape[1:], axes, e) + if consolidate: + mgr._consolidate_inplace() + return mgr + + def construction_error( tot_items: int, block_shape: Shape, From ba0d122f291bc8b8f80e6bd6cd53f142125e34f6 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Sun, 11 Jul 2021 19:22:19 +0100 Subject: [PATCH 10/16] move whatsnew to v1.4.0 --- doc/source/whatsnew/v1.3.1.rst | 2 +- doc/source/whatsnew/v1.4.0.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 2bf9024e9676a..9c17a22bf6d52 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -24,7 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in :class:`DataFrame` constructor always copying 2D object arrays (:issue:`39272`) +- - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5b09b62fa9e88..238dd84289ed0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -176,6 +176,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :func:`to_datetime` returning pd.NaT for inputs that produce duplicated values, when ``cache=True`` (:issue:`42259`) +- Bug in :class:`DataFrame` constructor always copying 2D object arrays (:issue:`39272`) - Timedelta From f9512465f8a907a40caf694e72aebfb8347cc03f Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Sun, 11 Jul 2021 19:22:31 +0100 Subject: [PATCH 11/16] annotate tests with link to issue --- pandas/tests/frame/test_constructors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a5568dda56c67..c992606cc88af 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -255,12 +255,14 @@ def test_constructor_dtype_nocast_view_2d_array(self): @td.skip_array_manager_invalid_test def test_1d_object_array_does_not_copy(self): + # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr) assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test def test_2d_object_array_does_not_copy(self): + # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") df = DataFrame(arr) assert np.shares_memory(df.values, arr) From e9d4166a886b439097638d4c46fe04072bdfdc5c Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Sun, 11 Jul 2021 20:08:12 +0100 Subject: [PATCH 12/16] reword whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index c69ab076d6c8d..43f517fe5a02f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -178,7 +178,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :func:`to_datetime` returning pd.NaT for inputs that produce duplicated values, when ``cache=True`` (:issue:`42259`) -- Bug in :class:`DataFrame` constructor always copying 2D object arrays (:issue:`39272`) +- Bug in :class:`DataFrame` constructor unnecessarily copying non-datetimelike 2D object arrays (:issue:`39272`) - Timedelta From e91fb3a9c3ca559cf6c3a1fce73b7265f5d2aacb Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Tue, 13 Jul 2021 21:55:20 +0100 Subject: [PATCH 13/16] Revert "Revert "try not replacing _from_blocks with _from_array"" This reverts commit 9e00908665b8903598f836dfb8d326e6bfa8f38e. --- pandas/core/internals/construction.py | 45 ++++++++++++++++++--------- pandas/core/internals/managers.py | 20 ------------ 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 57590ff1217fa..5441bd88d6bcc 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -78,11 +78,15 @@ ArrayManager, SingleArrayManager, ) +from pandas.core.internals.blocks import ( + ensure_block_shape, + new_block, +) from pandas.core.internals.managers import ( BlockManager, SingleBlockManager, - create_block_manager_from_array, create_block_manager_from_arrays, + create_block_manager_from_blocks, ) if TYPE_CHECKING: @@ -338,29 +342,42 @@ def ndarray_to_mgr( return ArrayManager(arrays, [index, columns], verify_integrity=False) - array = values.T + values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(array.dtype): - if array.ndim == 2 and array.shape[0] != 1: + if dtype is None and is_object_dtype(values.dtype): + + if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks - maybe_datetime = [ - maybe_infer_to_datetimelike(instance) for instance in array - ] + + dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] # don't convert (and copy) the objects if no type inference occurs if any( - not is_dtype_equal(instance.dtype, array.dtype) - for instance in maybe_datetime + not is_dtype_equal(instance.dtype, values.dtype) + for instance in dtlike_vals ): - return create_block_manager_from_arrays( - maybe_datetime, columns, [columns, index] - ) + dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] + block_values = [ + new_block(dvals_list[n], placement=n, ndim=2) + for n in range(len(dvals_list)) + ] + else: + nb = new_block(values, placement=slice(len(columns)), ndim=2) + block_values = [nb] else: - array = maybe_infer_to_datetimelike(array) + datelike_vals = maybe_infer_to_datetimelike(values) + nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) + block_values = [nb] + else: + nb = new_block(values, placement=slice(len(columns)), ndim=2) + block_values = [nb] + + if len(columns) == 0: + block_values = [] - return create_block_manager_from_array(array, [columns, index]) + return create_block_manager_from_blocks(block_values, [columns, index]) def _check_values_indices_shape_match( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9874663767cfa..dca6ddf703446 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1791,26 +1791,6 @@ def create_block_manager_from_arrays( return mgr -def create_block_manager_from_array( - array, - axes: list[Index], - consolidate: bool = True, -) -> BlockManager: - assert isinstance(axes, list) - assert all(isinstance(x, Index) for x in axes) - - array = _extract_array(array) - - try: - block = new_block(values=array, placement=slice(0, len(axes[0])), ndim=2) - mgr = BlockManager([block], axes) - except ValueError as e: - raise construction_error(array.shape[0], array.shape[1:], axes, e) - if consolidate: - mgr._consolidate_inplace() - return mgr - - def construction_error( tot_items: int, block_shape: Shape, From 96dd1b9e1f3b3cbc3151a6217b3726c2d7eb5181 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Tue, 13 Jul 2021 22:11:12 +0100 Subject: [PATCH 14/16] Drop outside if/else --- pandas/core/internals/construction.py | 32 ++++++++++----------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 5441bd88d6bcc..fdebc092476e2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -348,27 +348,19 @@ def ndarray_to_mgr( # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): - - if values.ndim == 2 and values.shape[0] != 1: - # transpose and separate blocks - - dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] - # don't convert (and copy) the objects if no type inference occurs - if any( - not is_dtype_equal(instance.dtype, values.dtype) - for instance in dtlike_vals - ): - dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] - block_values = [ - new_block(dvals_list[n], placement=n, ndim=2) - for n in range(len(dvals_list)) - ] - else: - nb = new_block(values, placement=slice(len(columns)), ndim=2) - block_values = [nb] + dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] + # don't convert (and copy) the objects if no type inference occurs + if any( + not is_dtype_equal(instance.dtype, values.dtype) + for instance in dtlike_vals + ): + dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] + block_values = [ + new_block(dvals_list[n], placement=n, ndim=2) + for n in range(len(dvals_list)) + ] else: - datelike_vals = maybe_infer_to_datetimelike(values) - nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) + nb = new_block(values, placement=slice(len(columns)), ndim=2) block_values = [nb] else: nb = new_block(values, placement=slice(len(columns)), ndim=2) From dc2ae20da37fa26285275c51ef9467179d753401 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Wed, 14 Jul 2021 00:10:13 +0100 Subject: [PATCH 15/16] better performance? --- pandas/core/internals/construction.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index fdebc092476e2..a396b1469b7ae 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -348,13 +348,11 @@ def ndarray_to_mgr( # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): - dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] + obj_columns = [x for x in values] + maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs - if any( - not is_dtype_equal(instance.dtype, values.dtype) - for instance in dtlike_vals - ): - dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] + if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): + dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime] block_values = [ new_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) From aa4523006411cebb17036bda25d423738b587392 Mon Sep 17 00:00:00 2001 From: Rafael Irgolic Date: Wed, 14 Jul 2021 19:45:21 +0100 Subject: [PATCH 16/16] lint --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index a396b1469b7ae..22cce5c614d5a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -348,7 +348,7 @@ def ndarray_to_mgr( # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): - obj_columns = [x for x in values] + obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):