Skip to content

Commit 4583a04

Browse files
authored
PERF: pd.concat with EA-backed indexes (#49128)
1 parent 5a4339f commit 4583a04

File tree

7 files changed

+67
-1
lines changed

7 files changed

+67
-1
lines changed

asv_bench/benchmarks/array.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,6 @@ def time_setitem_list(self, multiple_chunks):
7171

7272
def time_setitem_slice(self, multiple_chunks):
    """Benchmark assigning the scalar ``"foo"`` to every tenth element.

    ``multiple_chunks`` is the benchmark parameter; it is not read here.
    """
    self.array[::10] = "foo"
74+
75+
def time_tolist(self, multiple_chunks):
    """Benchmark converting ``self.array`` to a list via ``tolist()``.

    ``multiple_chunks`` is the benchmark parameter; it is not read here.
    The resulting list is discarded.
    """
    self.array.tolist()

asv_bench/benchmarks/join_merge.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from pandas import (
66
DataFrame,
7+
Index,
78
MultiIndex,
89
Series,
910
array,
@@ -92,6 +93,39 @@ def time_f_ordered(self, axis, ignore_index):
9293
concat(self.frame_f, axis=axis, ignore_index=ignore_index)
9394

9495

96+
class ConcatIndexDtype:
    """Benchmark ``concat`` of Series whose indexes use different dtypes.

    Parameterized over the index dtype, the concat ``axis``, the ``sort``
    flag, and whether the index is sorted before use.
    """

    params = (
        ["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"],
        [0, 1],
        [True, False],
        [True, False],
    )
    param_names = ["dtype", "axis", "sort", "is_monotonic"]

    def setup(self, dtype, axis, sort, is_monotonic):
        """Build ``self.series``: five Series over progressively shorter indexes.

        Raises
        ------
        NotImplementedError
            If ``dtype`` is not one of the dtypes handled below.
        """
        N = 10_000
        if dtype == "datetime64[ns]":
            vals = date_range("1970-01-01", periods=N)
        elif dtype in ("int64", "Int64"):
            vals = np.arange(N, dtype=np.int64)
        elif dtype in ("string[python]", "string[pyarrow]"):
            vals = tm.makeStringIndex(N)
        else:
            # Name the offending dtype so a bad parameter is easy to spot.
            raise NotImplementedError(dtype)

        idx = Index(vals, dtype=dtype)
        if is_monotonic:
            idx = idx.sort_values()
        else:
            # Reverse the values instead of sorting them.
            idx = idx[::-1]

        # Series ``i`` holds the scalar ``i`` and drops the first ``i``
        # labels, so the five indexes overlap but are not identical.
        self.series = [Series(i, idx[i:]) for i in range(5)]

    def time_concat_series(self, dtype, axis, sort, is_monotonic):
        """Time ``concat`` over the Series prepared in ``setup``."""
        concat(self.series, axis=axis, sort=sort)
127+
128+
95129
class Join:
96130

97131
params = [True, False]

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ Performance improvements
156156
- Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
157157
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
158158
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
159+
- Performance improvement for :func:`concat` with extension-array-backed indexes (:issue:`49128`)
159160
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
160161
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
161162
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).

pandas/core/arrays/masked.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,15 @@ def to_numpy(
427427
data = self._data.astype(dtype, copy=copy)
428428
return data
429429

430+
@doc(ExtensionArray.tolist)
def tolist(self):
    # Multi-dimensional input: convert each sub-array in turn, which
    # produces nested lists.
    if self.ndim > 1:
        return [x.tolist() for x in self]
    if not self._hasna:
        # No missing values: ndarray.tolist() does the conversion in one
        # call and yields Python scalars, whereas list(self._data) would
        # yield NumPy scalar objects.
        return self._data.tolist()
    # Missing values present: go through an object-dtype array so the
    # non-missing elements are Python scalars as well.
    # NOTE(review): relies on to_numpy(dtype=object) filling masked
    # positions with NA by default -- confirm against to_numpy.
    return self.to_numpy(dtype=object).tolist()
438+
430439
@overload
431440
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
432441
...

pandas/core/arrays/string_.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
)
2121
from pandas.compat import pa_version_under6p0
2222
from pandas.compat.numpy import function as nv
23+
from pandas.util._decorators import doc
2324

2425
from pandas.core.dtypes.base import (
2526
ExtensionDtype,
@@ -214,7 +215,11 @@ class BaseStringArray(ExtensionArray):
214215
Mixin class for StringArray, ArrowStringArray.
215216
"""
216217

217-
pass
218+
@doc(ExtensionArray.tolist)
def tolist(self):
    # Multi-dimensional input: convert each sub-array in turn, which
    # produces nested lists.
    if self.ndim > 1:
        return [x.tolist() for x in self]
    # Convert to a NumPy array once, then let NumPy build the list in a
    # single call rather than iterating the array from Python.
    return self.to_numpy().tolist()
218223

219224

220225
class StringArray(BaseStringArray, PandasArray):

pandas/tests/arrays/masked/test_function.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,9 @@ def test_round(data, numpy_dtype):
4949
dtype=data.dtype,
5050
)
5151
tm.assert_extension_array_equal(result, expected)
52+
53+
54+
def test_tolist(data):
    # ``tolist`` must give the same elements, in the same order, as plain
    # iteration over the array.
    result = data.tolist()
    expected = list(data)
    tm.assert_equal(result, expected)

pandas/tests/arrays/string_/test_string.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,3 +595,11 @@ def test_setitem_scalar_with_mask_validation(dtype):
595595
msg = "Scalar must be NA or str"
596596
with pytest.raises(ValueError, match=msg):
597597
ser[mask] = 1
598+
599+
600+
def test_tolist(dtype):
    # Round trip: a list of strings converted to an array of the given
    # dtype must come back unchanged from ``tolist``.
    vals = ["a", "b", "c"]
    arr = pd.array(vals, dtype=dtype)
    result = arr.tolist()
    expected = vals
    tm.assert_equal(result, expected)

0 commit comments

Comments
 (0)