diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3a07149fe8171..461363d295f6a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,6 +154,7 @@ jobs: source activate pandas-dev pytest pandas/tests/frame/methods --array-manager pytest pandas/tests/arithmetic/ --array-manager + pytest pandas/tests/reshape/merge --array-manager # indexing subset (temporary since other tests don't pass yet) pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 132598e03d6c0..054ce8a40288b 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -12,7 +12,7 @@ TimeDeltaBlock, make_block, ) -from pandas.core.internals.concat import concatenate_block_managers +from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( BlockManager, SingleBlockManager, @@ -35,7 +35,7 @@ "ArrayManager", "BlockManager", "SingleBlockManager", - "concatenate_block_managers", + "concatenate_managers", # those two are preserved here for downstream compatibility (GH-33892) "create_block_manager_from_arrays", "create_block_manager_from_blocks", diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index e09a434170780..d38d278e89a67 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -831,7 +831,7 @@ def _reindex_indexer( new_axes = list(self._axes) new_axes[axis] = new_axis - return type(self)(new_arrays, new_axes) + return type(self)(new_arrays, new_axes, do_integrity_check=False) def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 16440f7a4c2bf..362f2fde47e0b 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -49,7 +49,46 @@ from pandas import Index -def concatenate_block_managers( +def concatenate_array_managers( + mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool +) -> Manager: + """ + Concatenate array managers into one. + + Parameters + ---------- + mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + Returns + ------- + ArrayManager + """ + # reindex all arrays + mgrs = [] + for mgr, indexers in mgrs_indexers: + for ax, indexer in indexers.items(): + mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True) + mgrs.append(mgr) + + if concat_axis == 1: + # concatting along the rows -> concat the reindexed arrays + # TODO(ArrayManager) doesn't yet preserve the correct dtype + arrays = [ + concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))]) + for j in range(len(mgrs[0].arrays)) + ] + return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False) + else: + # concatting along the columns -> combine reindexed arrays in a single manager + assert concat_axis == 0 + arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) + return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False) + + +def concatenate_managers( mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool ) -> Manager: """ @@ -66,20 +105,9 @@ def concatenate_block_managers( ------- BlockManager """ + # TODO(ArrayManager) this assumes that all managers are of the same type if isinstance(mgrs_indexers[0][0], ArrayManager): - - if concat_axis == 1: - # TODO for now only fastpath without indexers - mgrs = [t[0] for t in mgrs_indexers] - arrays = [ - concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0) - for j in range(len(mgrs[0].arrays)) - ] - return ArrayManager(arrays, [axes[1], axes[0]]) - elif concat_axis == 0: - mgrs = [t[0] for t in mgrs_indexers] - arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) - return ArrayManager(arrays, [axes[1], axes[0]]) + return concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9c536abbc7559..86df773147e21 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -282,6 +282,18 @@ def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) return algos.take_nd(dtypes, self.blknos, allow_fill=False) + @property + def arrays(self): + """ + Quick access to the backing arrays of the Blocks. + + Only for compatibility with ArrayManager for testing convenience. + Not to be used in actual code, and return value is not the same as the + ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). + """ + for blk in self.blocks: + yield blk.values + def __getstate__(self): block_values = [b.values for b in self.blocks] block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 92fc4a2e85163..a8c6913cd5d6c 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -43,7 +43,7 @@ get_unanimous_names, ) import pandas.core.indexes.base as ibase -from pandas.core.internals import concatenate_block_managers +from pandas.core.internals import concatenate_managers if TYPE_CHECKING: from pandas import ( @@ -524,7 +524,7 @@ def get_result(self): mgrs_indexers.append((obj._mgr, indexers)) - new_data = concatenate_block_managers( + new_data = concatenate_managers( mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy ) if not self.copy: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 79d018427aa33..217d9915f834c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -76,7 +76,7 @@ import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc -from pandas.core.internals import concatenate_block_managers +from pandas.core.internals import concatenate_managers from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: @@ -720,7 +720,7 @@ def get_result(self): lindexers = {1: left_indexer} if left_indexer is not None else {} rindexers = {1: right_indexer} if right_indexer is not None else {} - result_data = concatenate_block_managers( + result_data = concatenate_managers( [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], axes=[llabels.append(rlabels), join_index], concat_axis=0, @@ -1616,7 +1616,7 @@ def get_result(self): lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} - result_data = concatenate_block_managers( + result_data = concatenate_managers( [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], axes=[llabels.append(rlabels), join_index], concat_axis=0, diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index dc9a1565aad1e..0f51c4aef79db 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -161,7 +161,7 @@ def test_drop(self): assert return_value is None tm.assert_frame_equal(df, expected) - @td.skip_array_manager_not_yet_implemented + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_drop_multiindex_not_lexsorted(self): # GH#11640 diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index be80dd49ff1fb..bd0901387eeed 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -1,14 +1,9 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm -# TODO(ArrayManager) concat with reindexing -pytestmark = td.skip_array_manager_not_yet_implemented - def test_error(): df = pd.DataFrame( diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 90456ad949f59..1c7f7e3ff674a 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -15,9 +13,6 @@ ) import pandas._testing as tm -# TODO(ArrayManager) concat with reindexing -pytestmark = td.skip_array_manager_not_yet_implemented - @pytest.fixture def frame_with_period_index(): @@ -240,8 +235,9 @@ def test_join(self, multiindex_dataframe_random_data): b = frame.loc[frame.index[2:], ["B", "C"]] joined = a.join(b, how="outer").reindex(frame.index) - expected = frame.copy() - expected.values[np.isnan(joined.values)] = np.nan + expected = frame.copy().values + expected[np.isnan(joined.values)] = np.nan + expected = DataFrame(expected, index=frame.index, columns=frame.columns) assert not np.isnan(joined.values).all() diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 2339e21288bb5..24d1973eeda6d 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -121,7 +121,7 @@ def test_ambiguous_width(self): assert adjoined == expected -@td.skip_array_manager_not_yet_implemented +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON class TestTableSchemaRepr: @classmethod def setup_class(cls): diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index d9575a6ad81e5..3131131682ccd 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -247,7 +247,7 @@ def test_pickle_options(fsspectest): tm.assert_frame_equal(df, out) -@td.skip_array_manager_not_yet_implemented +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON def test_json_options(fsspectest): df = DataFrame({"a": [0]}) df.to_json("testmem://afile", storage_options={"test": "json_write"}) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index e5499c44be7d7..2ec94d4cebf5a 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -551,6 +553,7 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"]) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d9af59382ae79..e1b1e80a29a43 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -287,17 +287,27 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - def test_merge_nocopy(self): + def test_merge_nocopy(self, using_array_manager): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) merged = merge(left, right, left_index=True, right_index=True, copy=False) - merged["a"] = 6 - assert (left["a"] == 6).all() + if using_array_manager: + # With ArrayManager, setting a column doesn't change the values inplace + # and thus does not propagate the changes to the original left/right + # dataframes -> need to check that no copy was made in a different way + # TODO(ArrayManager) we should be able to simplify this with a .loc + # setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10 + # but this currently replaces the array (_setitem_with_indexer_split_path) + assert merged._mgr.arrays[0] is left._mgr.arrays[0] + assert merged._mgr.arrays[2] is right._mgr.arrays[0] + else: + merged["a"] = 6 + assert (left["a"] == 6).all() - merged["d"] = "peekaboo" - assert (right["d"] == "peekaboo").all() + merged["d"] = "peekaboo" + assert (right["d"] == "peekaboo").all() def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -1381,7 +1391,10 @@ def test_merge_readonly(self): np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] ) - data1._mgr.blocks[0].values.flags.writeable = False + # make each underlying block array / column array read-only + for arr in data1._mgr.arrays: + arr.flags.writeable = False + data1.merge(data2) # no error