Skip to content

Commit e19403d

Browse files
[ArrayManager] Implement concat with axis=1 (merge/join) (#39841)
1 parent 3289f82 commit e19403d

File tree

14 files changed

+91
-43
lines changed

14 files changed

+91
-43
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ jobs:
154154
source activate pandas-dev
155155
pytest pandas/tests/frame/methods --array-manager
156156
pytest pandas/tests/arithmetic/ --array-manager
157+
pytest pandas/tests/reshape/merge --array-manager
157158
158159
# indexing subset (temporary since other tests don't pass yet)
159160
pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager

pandas/core/internals/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
TimeDeltaBlock,
1313
make_block,
1414
)
15-
from pandas.core.internals.concat import concatenate_block_managers
15+
from pandas.core.internals.concat import concatenate_managers
1616
from pandas.core.internals.managers import (
1717
BlockManager,
1818
SingleBlockManager,
@@ -35,7 +35,7 @@
3535
"ArrayManager",
3636
"BlockManager",
3737
"SingleBlockManager",
38-
"concatenate_block_managers",
38+
"concatenate_managers",
3939
# those two are preserved here for downstream compatibility (GH-33892)
4040
"create_block_manager_from_arrays",
4141
"create_block_manager_from_blocks",

pandas/core/internals/array_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -831,7 +831,7 @@ def _reindex_indexer(
831831
new_axes = list(self._axes)
832832
new_axes[axis] = new_axis
833833

834-
return type(self)(new_arrays, new_axes)
834+
return type(self)(new_arrays, new_axes, do_integrity_check=False)
835835

836836
def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True):
837837
"""

pandas/core/internals/concat.py

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,46 @@
4949
from pandas import Index
5050

5151

52-
def concatenate_block_managers(
52+
def concatenate_array_managers(
    mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool
) -> Manager:
    """
    Concatenate array managers into one.

    Each manager is first reindexed with its own indexers, after which the
    per-column arrays are either concatenated position-wise
    (``concat_axis == 1``) or collected side by side into a single manager
    (``concat_axis == 0``).

    Parameters
    ----------
    mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
    axes : list of Index
        Axes of the result; handed to the resulting manager in swapped order
        as ``[axes[1], axes[0]]``.
    concat_axis : int
        1 to concatenate the arrays of the managers position by position,
        0 to combine the arrays of all managers in a single manager.
    copy : bool
        Whether the result must not share its arrays with the inputs. Only
        matters for ``concat_axis == 0``, where the input arrays would
        otherwise be reused as-is.

    Returns
    -------
    ArrayManager
    """
    # reindex all arrays
    mgrs = []
    for mgr, indexers in mgrs_indexers:
        rows_reindexed = False
        for ax, indexer in indexers.items():
            mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True)
            if ax == 1 and indexer is not None:
                # NOTE(review): reindexing along axis 1 with an actual indexer
                # is expected to allocate new arrays (a take) -- confirm
                # against ArrayManager.reindex_indexer
                rows_reindexed = True
        if copy and concat_axis == 0 and not rows_reindexed:
            # Without this the combined manager below would hold the very
            # same array objects as the input manager, ignoring ``copy=True``.
            mgr = mgr.copy()
        mgrs.append(mgr)

    if concat_axis == 1:
        # concatting along the rows -> concat the reindexed arrays
        # TODO(ArrayManager) doesn't yet preserve the correct dtype
        # NOTE(review): concat_compat is presumed to return new arrays, so no
        # explicit copy is made on this path -- confirm
        arrays = [
            concat_compat([mgr.arrays[j] for mgr in mgrs])
            for j in range(len(mgrs[0].arrays))
        ]
    else:
        # concatting along the columns -> combine reindexed arrays in a single manager
        assert concat_axis == 0
        arrays = list(itertools.chain.from_iterable(mgr.arrays for mgr in mgrs))

    return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False)
89+
90+
91+
def concatenate_managers(
5392
mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool
5493
) -> Manager:
5594
"""
@@ -66,20 +105,9 @@ def concatenate_block_managers(
66105
-------
67106
BlockManager
68107
"""
108+
# TODO(ArrayManager) this assumes that all managers are of the same type
69109
if isinstance(mgrs_indexers[0][0], ArrayManager):
70-
71-
if concat_axis == 1:
72-
# TODO for now only fastpath without indexers
73-
mgrs = [t[0] for t in mgrs_indexers]
74-
arrays = [
75-
concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0)
76-
for j in range(len(mgrs[0].arrays))
77-
]
78-
return ArrayManager(arrays, [axes[1], axes[0]])
79-
elif concat_axis == 0:
80-
mgrs = [t[0] for t in mgrs_indexers]
81-
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
82-
return ArrayManager(arrays, [axes[1], axes[0]])
110+
return concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)
83111

84112
concat_plans = [
85113
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers

pandas/core/internals/managers.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,18 @@ def get_dtypes(self):
282282
dtypes = np.array([blk.dtype for blk in self.blocks])
283283
return algos.take_nd(dtypes, self.blknos, allow_fill=False)
284284

285+
@property
def arrays(self):
    """
    Lazily iterate over the ``values`` of every Block, in block order.

    Exists only so tests can treat this manager like an ArrayManager; it is
    not meant to be used in actual code. Note that the return value differs
    from the ArrayManager attribute of the same name: ArrayManager gives a
    list of 1D arrays, whereas this is an iterator of the blocks' values
    (2D ndarrays / 1D EAs).
    """
    yield from (block.values for block in self.blocks)
296+
285297
def __getstate__(self):
286298
block_values = [b.values for b in self.blocks]
287299
block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]

pandas/core/reshape/concat.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
get_unanimous_names,
4444
)
4545
import pandas.core.indexes.base as ibase
46-
from pandas.core.internals import concatenate_block_managers
46+
from pandas.core.internals import concatenate_managers
4747

4848
if TYPE_CHECKING:
4949
from pandas import (
@@ -524,7 +524,7 @@ def get_result(self):
524524

525525
mgrs_indexers.append((obj._mgr, indexers))
526526

527-
new_data = concatenate_block_managers(
527+
new_data = concatenate_managers(
528528
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
529529
)
530530
if not self.copy:

pandas/core/reshape/merge.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
import pandas.core.common as com
7777
from pandas.core.construction import extract_array
7878
from pandas.core.frame import _merge_doc
79-
from pandas.core.internals import concatenate_block_managers
79+
from pandas.core.internals import concatenate_managers
8080
from pandas.core.sorting import is_int64_overflow_possible
8181

8282
if TYPE_CHECKING:
@@ -720,7 +720,7 @@ def get_result(self):
720720
lindexers = {1: left_indexer} if left_indexer is not None else {}
721721
rindexers = {1: right_indexer} if right_indexer is not None else {}
722722

723-
result_data = concatenate_block_managers(
723+
result_data = concatenate_managers(
724724
[(self.left._mgr, lindexers), (self.right._mgr, rindexers)],
725725
axes=[llabels.append(rlabels), join_index],
726726
concat_axis=0,
@@ -1616,7 +1616,7 @@ def get_result(self):
16161616
lindexers = {1: left_join_indexer} if left_join_indexer is not None else {}
16171617
rindexers = {1: right_join_indexer} if right_join_indexer is not None else {}
16181618

1619-
result_data = concatenate_block_managers(
1619+
result_data = concatenate_managers(
16201620
[(self.left._mgr, lindexers), (self.right._mgr, rindexers)],
16211621
axes=[llabels.append(rlabels), join_index],
16221622
concat_axis=0,

pandas/tests/frame/methods/test_drop.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def test_drop(self):
161161
assert return_value is None
162162
tm.assert_frame_equal(df, expected)
163163

164-
@td.skip_array_manager_not_yet_implemented
164+
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby
165165
def test_drop_multiindex_not_lexsorted(self):
166166
# GH#11640
167167

pandas/tests/frame/methods/test_explode.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
11
import numpy as np
22
import pytest
33

4-
import pandas.util._test_decorators as td
5-
64
import pandas as pd
75
import pandas._testing as tm
86

9-
# TODO(ArrayManager) concat with reindexing
10-
pytestmark = td.skip_array_manager_not_yet_implemented
11-
127

138
def test_error():
149
df = pd.DataFrame(

pandas/tests/frame/methods/test_join.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
import pandas.util._test_decorators as td
7-
86
import pandas as pd
97
from pandas import (
108
DataFrame,
@@ -15,9 +13,6 @@
1513
)
1614
import pandas._testing as tm
1715

18-
# TODO(ArrayManager) concat with reindexing
19-
pytestmark = td.skip_array_manager_not_yet_implemented
20-
2116

2217
@pytest.fixture
2318
def frame_with_period_index():
@@ -240,8 +235,9 @@ def test_join(self, multiindex_dataframe_random_data):
240235
b = frame.loc[frame.index[2:], ["B", "C"]]
241236

242237
joined = a.join(b, how="outer").reindex(frame.index)
243-
expected = frame.copy()
244-
expected.values[np.isnan(joined.values)] = np.nan
238+
expected = frame.copy().values
239+
expected[np.isnan(joined.values)] = np.nan
240+
expected = DataFrame(expected, index=frame.index, columns=frame.columns)
245241

246242
assert not np.isnan(joined.values).all()
247243

pandas/tests/io/formats/test_printing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def test_ambiguous_width(self):
121121
assert adjoined == expected
122122

123123

124-
@td.skip_array_manager_not_yet_implemented
124+
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON
125125
class TestTableSchemaRepr:
126126
@classmethod
127127
def setup_class(cls):

pandas/tests/io/test_fsspec.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ def test_pickle_options(fsspectest):
247247
tm.assert_frame_equal(df, out)
248248

249249

250-
@td.skip_array_manager_not_yet_implemented
250+
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON
251251
def test_json_options(fsspectest):
252252
df = DataFrame({"a": [0]})
253253
df.to_json("testmem://afile", storage_options={"test": "json_write"})

pandas/tests/reshape/merge/test_join.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
import pandas.util._test_decorators as td
5+
46
import pandas as pd
57
from pandas import (
68
Categorical,
@@ -551,6 +553,7 @@ def test_join_non_unique_period_index(self):
551553
)
552554
tm.assert_frame_equal(result, expected)
553555

556+
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby
554557
def test_mixed_type_join_with_suffix(self):
555558
# GH #916
556559
df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])

pandas/tests/reshape/merge/test_merge.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -287,17 +287,27 @@ def test_merge_copy(self):
287287
merged["d"] = "peekaboo"
288288
assert (right["d"] == "bar").all()
289289

290-
def test_merge_nocopy(self):
290+
def test_merge_nocopy(self, using_array_manager):
291291
left = DataFrame({"a": 0, "b": 1}, index=range(10))
292292
right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))
293293

294294
merged = merge(left, right, left_index=True, right_index=True, copy=False)
295295

296-
merged["a"] = 6
297-
assert (left["a"] == 6).all()
296+
if using_array_manager:
297+
# With ArrayManager, setting a column doesn't change the values inplace
298+
# and thus does not propagate the changes to the original left/right
299+
# dataframes -> need to check that no copy was made in a different way
300+
# TODO(ArrayManager) we should be able to simplify this with a .loc
301+
# setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10
302+
# but this currently replaces the array (_setitem_with_indexer_split_path)
303+
assert merged._mgr.arrays[0] is left._mgr.arrays[0]
304+
assert merged._mgr.arrays[2] is right._mgr.arrays[0]
305+
else:
306+
merged["a"] = 6
307+
assert (left["a"] == 6).all()
298308

299-
merged["d"] = "peekaboo"
300-
assert (right["d"] == "peekaboo").all()
309+
merged["d"] = "peekaboo"
310+
assert (right["d"] == "peekaboo").all()
301311

302312
def test_intelligently_handle_join_key(self):
303313
# #733, be a bit more 1337 about not returning unconsolidated DataFrame
@@ -1381,7 +1391,10 @@ def test_merge_readonly(self):
13811391
np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]
13821392
)
13831393

1384-
data1._mgr.blocks[0].values.flags.writeable = False
1394+
# make each underlying block array / column array read-only
1395+
for arr in data1._mgr.arrays:
1396+
arr.flags.writeable = False
1397+
13851398
data1.merge(data2) # no error
13861399

13871400

0 commit comments

Comments
 (0)