Skip to content

[ArrayManager] Implement concat with axis=1 (merge/join) #39841

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ jobs:
source activate pandas-dev
pytest pandas/tests/frame/methods --array-manager
pytest pandas/tests/arithmetic/ --array-manager
pytest pandas/tests/reshape/merge --array-manager
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only reshape/merge and not reshape/concat, as there are too many tests still failing in the concat tests (because axis=0 is not yet handled properly)


# indexing subset (temporary since other tests don't pass yet)
pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/internals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
TimeDeltaBlock,
make_block,
)
from pandas.core.internals.concat import concatenate_block_managers
from pandas.core.internals.concat import concatenate_managers
from pandas.core.internals.managers import (
BlockManager,
SingleBlockManager,
Expand All @@ -35,7 +35,7 @@
"ArrayManager",
"BlockManager",
"SingleBlockManager",
"concatenate_block_managers",
"concatenate_managers",
# those two are preserved here for downstream compatibility (GH-33892)
"create_block_manager_from_arrays",
"create_block_manager_from_blocks",
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ def _reindex_indexer(
new_axes = list(self._axes)
new_axes[axis] = new_axis

return type(self)(new_arrays, new_axes)
return type(self)(new_arrays, new_axes, do_integrity_check=False)

def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True):
"""
Expand Down
56 changes: 42 additions & 14 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,46 @@
from pandas import Index


def concatenate_block_managers(
def concatenate_array_managers(
mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool
) -> Manager:
"""
Concatenate array managers into one.

Parameters
----------
mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
axes : list of Index
concat_axis : int
copy : bool

Returns
-------
ArrayManager
"""
# reindex all arrays
mgrs = []
for mgr, indexers in mgrs_indexers:
for ax, indexer in indexers.items():
mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True)
mgrs.append(mgr)

if concat_axis == 1:
# concatting along the rows -> concat the reindexed arrays
# TODO(ArrayManager) doesn't yet preserve the correct dtype
arrays = [
concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))])
for j in range(len(mgrs[0].arrays))
]
return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False)
else:
# concatting along the columns -> combine reindexed arrays in a single manager
assert concat_axis == 0
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False)


def concatenate_managers(
mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool
) -> Manager:
"""
Expand All @@ -66,20 +105,9 @@ def concatenate_block_managers(
-------
BlockManager
"""
# TODO(ArrayManager) this assumes that all managers are of the same type
if isinstance(mgrs_indexers[0][0], ArrayManager):

if concat_axis == 1:
# TODO for now only fastpath without indexers
mgrs = [t[0] for t in mgrs_indexers]
arrays = [
concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0)
for j in range(len(mgrs[0].arrays))
]
return ArrayManager(arrays, [axes[1], axes[0]])
elif concat_axis == 0:
mgrs = [t[0] for t in mgrs_indexers]
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
return ArrayManager(arrays, [axes[1], axes[0]])
return concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)

concat_plans = [
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,18 @@ def get_dtypes(self):
dtypes = np.array([blk.dtype for blk in self.blocks])
return algos.take_nd(dtypes, self.blknos, allow_fill=False)

@property
def arrays(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

annotate

"""
Quick access to the backing arrays of the Blocks.

Only for compatibility with ArrayManager for testing convenience.
Not to be used in actual code, and return value is not the same as the
ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs).
"""
for blk in self.blocks:
yield blk.values

def __getstate__(self):
block_values = [b.values for b in self.blocks]
block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
get_unanimous_names,
)
import pandas.core.indexes.base as ibase
from pandas.core.internals import concatenate_block_managers
from pandas.core.internals import concatenate_managers

if TYPE_CHECKING:
from pandas import (
Expand Down Expand Up @@ -524,7 +524,7 @@ def get_result(self):

mgrs_indexers.append((obj._mgr, indexers))

new_data = concatenate_block_managers(
new_data = concatenate_managers(
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
)
if not self.copy:
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.frame import _merge_doc
from pandas.core.internals import concatenate_block_managers
from pandas.core.internals import concatenate_managers
from pandas.core.sorting import is_int64_overflow_possible

if TYPE_CHECKING:
Expand Down Expand Up @@ -720,7 +720,7 @@ def get_result(self):
lindexers = {1: left_indexer} if left_indexer is not None else {}
rindexers = {1: right_indexer} if right_indexer is not None else {}

result_data = concatenate_block_managers(
result_data = concatenate_managers(
[(self.left._mgr, lindexers), (self.right._mgr, rindexers)],
axes=[llabels.append(rlabels), join_index],
concat_axis=0,
Expand Down Expand Up @@ -1616,7 +1616,7 @@ def get_result(self):
lindexers = {1: left_join_indexer} if left_join_indexer is not None else {}
rindexers = {1: right_join_indexer} if right_join_indexer is not None else {}

result_data = concatenate_block_managers(
result_data = concatenate_managers(
[(self.left._mgr, lindexers), (self.right._mgr, rindexers)],
axes=[llabels.append(rlabels), join_index],
concat_axis=0,
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def test_drop(self):
assert return_value is None
tm.assert_frame_equal(df, expected)

@td.skip_array_manager_not_yet_implemented
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby
def test_drop_multiindex_not_lexsorted(self):
# GH#11640

Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/frame/methods/test_explode.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

# TODO(ArrayManager) concat with reindexing
pytestmark = td.skip_array_manager_not_yet_implemented


def test_error():
df = pd.DataFrame(
Expand Down
10 changes: 3 additions & 7 deletions pandas/tests/frame/methods/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -15,9 +13,6 @@
)
import pandas._testing as tm

# TODO(ArrayManager) concat with reindexing
pytestmark = td.skip_array_manager_not_yet_implemented


@pytest.fixture
def frame_with_period_index():
Expand Down Expand Up @@ -240,8 +235,9 @@ def test_join(self, multiindex_dataframe_random_data):
b = frame.loc[frame.index[2:], ["B", "C"]]

joined = a.join(b, how="outer").reindex(frame.index)
expected = frame.copy()
expected.values[np.isnan(joined.values)] = np.nan
expected = frame.copy().values
expected[np.isnan(joined.values)] = np.nan
expected = DataFrame(expected, index=frame.index, columns=frame.columns)

assert not np.isnan(joined.values).all()

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/formats/test_printing.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def test_ambiguous_width(self):
assert adjoined == expected


@td.skip_array_manager_not_yet_implemented
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON
class TestTableSchemaRepr:
@classmethod
def setup_class(cls):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/test_fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def test_pickle_options(fsspectest):
tm.assert_frame_equal(df, out)


@td.skip_array_manager_not_yet_implemented
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON
def test_json_options(fsspectest):
df = DataFrame({"a": [0]})
df.to_json("testmem://afile", storage_options={"test": "json_write"})
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Categorical,
Expand Down Expand Up @@ -551,6 +553,7 @@ def test_join_non_unique_period_index(self):
)
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby
def test_mixed_type_join_with_suffix(self):
# GH #916
df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])
Expand Down
25 changes: 19 additions & 6 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,17 +287,27 @@ def test_merge_copy(self):
merged["d"] = "peekaboo"
assert (right["d"] == "bar").all()

def test_merge_nocopy(self):
def test_merge_nocopy(self, using_array_manager):
left = DataFrame({"a": 0, "b": 1}, index=range(10))
right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))

merged = merge(left, right, left_index=True, right_index=True, copy=False)

merged["a"] = 6
assert (left["a"] == 6).all()
if using_array_manager:
# With ArrayManager, setting a column doesn't change the values inplace
# and thus does not propagate the changes to the original left/right
# dataframes -> need to check that no copy was made in a different way
# TODO(ArrayManager) we should be able to simplify this with a .loc
# setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10
# but this currently replaces the array (_setitem_with_indexer_split_path)
assert merged._mgr.arrays[0] is left._mgr.arrays[0]
assert merged._mgr.arrays[2] is right._mgr.arrays[0]
else:
merged["a"] = 6
assert (left["a"] == 6).all()

merged["d"] = "peekaboo"
assert (right["d"] == "peekaboo").all()
merged["d"] = "peekaboo"
assert (right["d"] == "peekaboo").all()

def test_intelligently_handle_join_key(self):
# #733, be a bit more 1337 about not returning unconsolidated DataFrame
Expand Down Expand Up @@ -1381,7 +1391,10 @@ def test_merge_readonly(self):
np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]
)

data1._mgr.blocks[0].values.flags.writeable = False
# make each underlying block array / column array read-only
for arr in data1._mgr.arrays:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cabn you comment on this (I know what you are doing by seeing what you added, but is non-obvious here i think)

arr.flags.writeable = False

data1.merge(data2) # no error


Expand Down