Skip to content

Commit 02768ec

Browse files
committed
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
2 parents c6cf9ee + 52b8459 commit 02768ec

File tree

18 files changed

+209
-63
lines changed

18 files changed

+209
-63
lines changed

doc/source/ecosystem.rst

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,7 @@ arrays can be stored inside pandas' Series and DataFrame.
475475
`Pandas-Genomics`_
476476
~~~~~~~~~~~~~~~~~~
477477

478-
Pandas-Genomics provides extension types and extension arrays for working with genomics data
478+
Pandas-Genomics provides extension types, extension arrays, and extension accessors for working with genomics data.
479479

480480
`Pint-Pandas`_
481481
~~~~~~~~~~~~~~
@@ -502,16 +502,17 @@ A directory of projects providing
502502
:ref:`extension accessors <extending.register-accessors>`. This is for users to
503503
discover new accessors and for library authors to coordinate on the namespace.
504504

505-
=============== ============ ==================================== ===============================================================
506-
Library Accessor Classes Description
507-
=============== ============ ==================================== ===============================================================
508-
`cyberpandas`_ ``ip`` ``Series`` Provides common operations for working with IP addresses.
509-
`pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library.
510-
`pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series.
511-
`pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames.
512-
`composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing.
513-
`datatest`_ ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers.
514-
=============== ============ ==================================== ===============================================================
505+
================== ============ ==================================== ===============================================================================
506+
Library Accessor Classes Description
507+
================== ============ ==================================== ===============================================================================
508+
`cyberpandas`_ ``ip`` ``Series`` Provides common operations for working with IP addresses.
509+
`pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library.
510+
`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame`` Provides common operations for quality control and analysis of genomics data.
511+
`pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series.
512+
`pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames.
513+
`composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing.
514+
`datatest`_ ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers.
515+
================== ============ ==================================== ===============================================================================
515516

516517
.. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
517518
.. _pdvega: https://altair-viz.github.io/pdvega/

doc/source/whatsnew/v1.2.4.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717

1818
- Fixed regression in :meth:`DataFrame.sum` raising a ``ValueError`` when ``min_count`` greater than the :class:`DataFrame` shape was passed (:issue:`39738`)
1919
- Fixed regression in :meth:`DataFrame.to_json` raising ``AttributeError`` when run on PyPy (:issue:`39837`)
20+
- Fixed regression in (in)equality comparison of ``pd.NaT`` with a non-datetimelike numpy array returning a scalar instead of an array (:issue:`40722`)
2021
- Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`)
2122
- Fixed regression in :meth:`DataFrame.replace` raising ``IndexError`` when ``regex`` was a multi-key dictionary (:issue:`39338`)
2223
-

doc/source/whatsnew/v1.3.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,7 @@ Numeric
563563
- Bug in :func:`select_dtypes` behaving differently between Windows and Linux with ``include="int"`` (:issue:`36569`)
564564
- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
565565
- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
566+
- Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`)
566567
-
567568

568569
Conversion

pandas/_libs/algos.pyx

Lines changed: 47 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -947,12 +947,14 @@ def rank_1d(
947947
TiebreakEnumType tiebreak
948948
Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0
949949
Py_ssize_t grp_vals_seen=1, grp_na_count=0
950-
ndarray[int64_t, ndim=1] lexsort_indexer
951-
ndarray[float64_t, ndim=1] grp_sizes, out
950+
ndarray[int64_t, ndim=1] grp_sizes
951+
ndarray[intp_t, ndim=1] lexsort_indexer
952+
ndarray[float64_t, ndim=1] out
952953
ndarray[rank_t, ndim=1] masked_vals
953954
ndarray[uint8_t, ndim=1] mask
954955
bint keep_na, at_end, next_val_diff, check_labels, group_changed
955956
rank_t nan_fill_val
957+
int64_t grp_size
956958

957959
tiebreak = tiebreakers[ties_method]
958960
if tiebreak == TIEBREAK_FIRST:
@@ -965,7 +967,7 @@ def rank_1d(
965967
# TODO Cython 3.0: cast won't be necessary (#2992)
966968
assert <Py_ssize_t>len(labels) == N
967969
out = np.empty(N)
968-
grp_sizes = np.ones(N)
970+
grp_sizes = np.ones(N, dtype=np.int64)
969971

970972
# If all 0 labels, can short-circuit later label
971973
# comparisons
@@ -1022,7 +1024,7 @@ def rank_1d(
10221024
# each label corresponds to a different group value,
10231025
# the mask helps you differentiate missing values before
10241026
# performing sort on the actual values
1025-
lexsort_indexer = np.lexsort(order).astype(np.int64, copy=False)
1027+
lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False)
10261028

10271029
if not ascending:
10281030
lexsort_indexer = lexsort_indexer[::-1]
@@ -1093,13 +1095,15 @@ def rank_1d(
10931095
for j in range(i - dups + 1, i + 1):
10941096
out[lexsort_indexer[j]] = grp_vals_seen
10951097

1096-
# Look forward to the next value (using the sorting in lexsort_indexer)
1097-
# if the value does not equal the current value then we need to
1098-
# reset the dups and sum_ranks, knowing that a new value is
1099-
# coming up. The conditional also needs to handle nan equality
1100-
# and the end of iteration
1101-
if next_val_diff or (mask[lexsort_indexer[i]]
1102-
^ mask[lexsort_indexer[i+1]]):
1098+
# Look forward to the next value (using the sorting in
1099+
# lexsort_indexer). If the value does not equal the current
1100+
# value then we need to reset the dups and sum_ranks, knowing
1101+
# that a new value is coming up. The conditional also needs
1102+
# to handle nan equality and the end of iteration. If group
1103+
# changes we do not record seeing a new value in the group
1104+
if not group_changed and (next_val_diff or
1105+
(mask[lexsort_indexer[i]]
1106+
^ mask[lexsort_indexer[i+1]])):
11031107
dups = sum_ranks = 0
11041108
grp_vals_seen += 1
11051109

@@ -1110,14 +1114,21 @@ def rank_1d(
11101114
# group encountered (used by pct calculations later). Also be
11111115
# sure to reset any of the items helping to calculate dups
11121116
if group_changed:
1117+
1118+
# If not dense tiebreak, group size used to compute
1119+
# percentile will be # of non-null elements in group
11131120
if tiebreak != TIEBREAK_DENSE:
1114-
for j in range(grp_start, i + 1):
1115-
grp_sizes[lexsort_indexer[j]] = \
1116-
(i - grp_start + 1 - grp_na_count)
1121+
grp_size = i - grp_start + 1 - grp_na_count
1122+
1123+
# Otherwise, it will be the number of distinct values
1124+
# in the group, subtracting 1 if NaNs are present
1125+
# since that is a distinct value we shouldn't count
11171126
else:
1118-
for j in range(grp_start, i + 1):
1119-
grp_sizes[lexsort_indexer[j]] = \
1120-
(grp_vals_seen - 1 - (grp_na_count > 0))
1127+
grp_size = grp_vals_seen - (grp_na_count > 0)
1128+
1129+
for j in range(grp_start, i + 1):
1130+
grp_sizes[lexsort_indexer[j]] = grp_size
1131+
11211132
dups = sum_ranks = 0
11221133
grp_na_count = 0
11231134
grp_start = i + 1
@@ -1184,12 +1195,14 @@ def rank_1d(
11841195
out[lexsort_indexer[j]] = grp_vals_seen
11851196

11861197
# Look forward to the next value (using the sorting in
1187-
# lexsort_indexer) if the value does not equal the current
1198+
# lexsort_indexer). If the value does not equal the current
11881199
# value then we need to reset the dups and sum_ranks, knowing
11891200
# that a new value is coming up. The conditional also needs
1190-
# to handle nan equality and the end of iteration
1191-
if next_val_diff or (mask[lexsort_indexer[i]]
1192-
^ mask[lexsort_indexer[i+1]]):
1201+
# to handle nan equality and the end of iteration. If group
1202+
# changes we do not record seeing a new value in the group
1203+
if not group_changed and (next_val_diff or
1204+
(mask[lexsort_indexer[i]]
1205+
^ mask[lexsort_indexer[i+1]])):
11931206
dups = sum_ranks = 0
11941207
grp_vals_seen += 1
11951208

@@ -1200,14 +1213,21 @@ def rank_1d(
12001213
# group encountered (used by pct calculations later). Also be
12011214
# sure to reset any of the items helping to calculate dups
12021215
if group_changed:
1216+
1217+
# If not dense tiebreak, group size used to compute
1218+
# percentile will be # of non-null elements in group
12031219
if tiebreak != TIEBREAK_DENSE:
1204-
for j in range(grp_start, i + 1):
1205-
grp_sizes[lexsort_indexer[j]] = \
1206-
(i - grp_start + 1 - grp_na_count)
1220+
grp_size = i - grp_start + 1 - grp_na_count
1221+
1222+
# Otherwise, it will be the number of distinct values
1223+
# in the group, subtracting 1 if NaNs are present
1224+
# since that is a distinct value we shouldn't count
12071225
else:
1208-
for j in range(grp_start, i + 1):
1209-
grp_sizes[lexsort_indexer[j]] = \
1210-
(grp_vals_seen - 1 - (grp_na_count > 0))
1226+
grp_size = grp_vals_seen - (grp_na_count > 0)
1227+
1228+
for j in range(grp_start, i + 1):
1229+
grp_sizes[lexsort_indexer[j]] = grp_size
1230+
12111231
dups = sum_ranks = 0
12121232
grp_na_count = 0
12131233
grp_start = i + 1

pandas/_libs/tslibs/nattype.pyx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,10 @@ cdef class _NaT(datetime):
127127
result.fill(_nat_scalar_rules[op])
128128
elif other.dtype.kind == "O":
129129
result = np.array([PyObject_RichCompare(self, x, op) for x in other])
130+
elif op == Py_EQ:
131+
result = np.zeros(other.shape, dtype=bool)
132+
elif op == Py_NE:
133+
result = np.ones(other.shape, dtype=bool)
130134
else:
131135
return NotImplemented
132136
return result

pandas/conftest.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1131,6 +1131,24 @@ def string_dtype(request):
11311131
return request.param
11321132

11331133

1134+
@pytest.fixture(
1135+
params=[
1136+
"string",
1137+
pytest.param(
1138+
"arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
1139+
),
1140+
]
1141+
)
1142+
def nullable_string_dtype(request):
1143+
"""
1144+
Parametrized fixture for nullable string dtypes.
1145+
1146+
* 'string'
1147+
* 'arrow_string'
1148+
"""
1149+
return request.param
1150+
1151+
11341152
@pytest.fixture(params=tm.BYTES_DTYPES)
11351153
def bytes_dtype(request):
11361154
"""

pandas/core/frame.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -472,8 +472,9 @@ class DataFrame(NDFrame, OpsMixin):
472472
Index to use for resulting frame. Will default to RangeIndex if
473473
no indexing information part of input data and no index provided.
474474
columns : Index or array-like
475-
Column labels to use for resulting frame. Will default to
476-
RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
475+
Column labels to use for resulting frame when data does not have them,
476+
defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
477+
will perform column selection instead.
477478
dtype : dtype, default None
478479
Data type to force. Only a single dtype is allowed. If None, infer.
479480
copy : bool or None, default None
@@ -527,6 +528,18 @@ class DataFrame(NDFrame, OpsMixin):
527528
1 4 5 6
528529
2 7 8 9
529530
531+
Constructing DataFrame from a numpy ndarray that has labeled columns:
532+
533+
>>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
534+
... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
535+
>>> df3 = pd.DataFrame(data, columns=['c', 'a'])
536+
...
537+
>>> df3
538+
c a
539+
0 3 1
540+
1 6 4
541+
2 9 7
542+
530543
Constructing DataFrame from dataclass:
531544
532545
>>> from dataclasses import make_dataclass

pandas/plotting/_matplotlib/compat.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ def inner():
2222
mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge)
2323
mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge)
2424
mpl_ge_3_3_0 = _mpl_version("3.3.0", operator.ge)
25+
mpl_ge_3_4_0 = _mpl_version("3.4.0", operator.ge)

pandas/plotting/_matplotlib/tools.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,11 @@ def handle_shared_axes(
392392
row_num = lambda x: x.rowNum
393393
col_num = lambda x: x.colNum
394394

395+
if compat.mpl_ge_3_4_0():
396+
is_first_col = lambda x: x.get_subplotspec().is_first_col()
397+
else:
398+
is_first_col = lambda x: x.is_first_col()
399+
395400
if nrows > 1:
396401
try:
397402
# first find out the ax layout,
@@ -423,7 +428,7 @@ def handle_shared_axes(
423428
# only the first column should get y labels -> set all other to
424429
# off as we only have labels in the first column and we always
425430
# have a subplot there, we can skip the layout test
426-
if ax.is_first_col():
431+
if is_first_col(ax):
427432
continue
428433
if sharey or _has_externally_shared_axis(ax, "y"):
429434
_remove_labels_from_axis(ax.yaxis)

pandas/tests/dtypes/test_common.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,10 @@ def test_is_string_dtype():
281281
assert com.is_string_dtype(object)
282282
assert com.is_string_dtype(np.array(["a", "b"]))
283283
assert com.is_string_dtype(pd.StringDtype())
284-
assert com.is_string_dtype(pd.array(["a", "b"], dtype="string"))
284+
285+
286+
def test_is_string_dtype_nullable(nullable_string_dtype):
287+
assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype))
285288

286289

287290
integer_dtypes: List = []

pandas/tests/extension/json/array.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
ExtensionDtype,
4040
)
4141
from pandas.api.types import is_bool_dtype
42+
from pandas.core.arrays.string_arrow import ArrowStringDtype
4243

4344

4445
class JSONDtype(ExtensionDtype):
@@ -194,7 +195,7 @@ def astype(self, dtype, copy=True):
194195
if copy:
195196
return self.copy()
196197
return self
197-
elif isinstance(dtype, StringDtype):
198+
elif isinstance(dtype, (StringDtype, ArrowStringDtype)):
198199
value = self.astype(str) # numpy doesn't like nested dicts
199200
return dtype.construct_array_type()._from_sequence(value, copy=False)
200201

pandas/tests/frame/methods/test_combine_first.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -381,15 +381,17 @@ def test_combine_first_with_asymmetric_other(self, val):
381381

382382
tm.assert_frame_equal(res, exp)
383383

384-
def test_combine_first_string_dtype_only_na(self):
384+
def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
385385
# GH: 37519
386-
df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string")
387-
df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string")
386+
df = DataFrame(
387+
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
388+
)
389+
df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype)
388390
df.set_index(["a", "b"], inplace=True)
389391
df2.set_index(["a", "b"], inplace=True)
390392
result = df.combine_first(df2)
391393
expected = DataFrame(
392-
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string"
394+
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
393395
).set_index(["a", "b"])
394396
tm.assert_frame_equal(result, expected)
395397

pandas/tests/frame/test_constructors.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1654,10 +1654,10 @@ def test_constructor_empty_with_string_dtype(self):
16541654
df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5")
16551655
tm.assert_frame_equal(df, expected)
16561656

1657-
def test_constructor_empty_with_string_extension(self):
1657+
def test_constructor_empty_with_string_extension(self, nullable_string_dtype):
16581658
# GH 34915
1659-
expected = DataFrame(index=[], columns=["c1"], dtype="string")
1660-
df = DataFrame(columns=["c1"], dtype="string")
1659+
expected = DataFrame(index=[], columns=["c1"], dtype=nullable_string_dtype)
1660+
df = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
16611661
tm.assert_frame_equal(df, expected)
16621662

16631663
def test_constructor_single_value(self):

pandas/tests/groupby/test_rank.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,3 +542,28 @@ def test_rank_min_int():
542542
)
543543

544544
tm.assert_frame_equal(result, expected)
545+
546+
547+
@pytest.mark.parametrize("use_nan", [True, False])
548+
def test_rank_pct_equal_values_on_group_transition(use_nan):
549+
# GH#40518
550+
fill_value = np.nan if use_nan else 3
551+
df = DataFrame(
552+
[
553+
[-1, 1],
554+
[-1, 2],
555+
[1, fill_value],
556+
[-1, fill_value],
557+
],
558+
columns=["group", "val"],
559+
)
560+
result = df.groupby(["group"])["val"].rank(
561+
method="dense",
562+
pct=True,
563+
)
564+
if use_nan:
565+
expected = Series([0.5, 1, np.nan, np.nan], name="val")
566+
else:
567+
expected = Series([1 / 3, 2 / 3, 1, 1], name="val")
568+
569+
tm.assert_series_equal(result, expected)

pandas/tests/plotting/frame/test_frame.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -746,7 +746,9 @@ def test_plot_scatter_with_categorical_data(self, x, y):
746746

747747
_check_plot_works(df.plot.scatter, x=x, y=y)
748748

749-
def test_plot_scatter_with_c(self):
749+
def test_plot_scatter_with_c(self, request):
750+
from pandas.plotting._matplotlib.compat import mpl_ge_3_4_0
751+
750752
df = DataFrame(
751753
np.random.randn(6, 4),
752754
index=list(string.ascii_letters[:6]),
@@ -758,9 +760,10 @@ def test_plot_scatter_with_c(self):
758760
# default to Greys
759761
assert ax.collections[0].cmap.name == "Greys"
760762

761-
# n.b. there appears to be no public method
762-
# to get the colorbar label
763-
assert ax.collections[0].colorbar._label == "z"
763+
if mpl_ge_3_4_0():
764+
assert ax.collections[0].colorbar.ax.get_ylabel() == "z"
765+
else:
766+
assert ax.collections[0].colorbar._label == "z"
764767

765768
cm = "cubehelix"
766769
ax = df.plot.scatter(x="x", y="y", c="z", colormap=cm)

0 commit comments

Comments
 (0)