Skip to content

Commit 452e992

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-predicates-nan-propagation
2 parents ddd531a + 160b3eb commit 452e992

File tree

93 files changed

+1443
-915
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

93 files changed

+1443
-915
lines changed

.github/workflows/unit-tests.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ jobs:
380380
fetch-depth: 0
381381

382382
- name: Set up Python Free-threading Version
383-
uses: deadsnakes/action@v3.1.0
383+
uses: deadsnakes/action@v3.2.0
384384
with:
385385
python-version: 3.13-dev
386386
nogil: true

ci/code_checks.sh

-32
Original file line numberDiff line numberDiff line change
@@ -70,15 +70,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7070
--format=actions \
7171
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
7272
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
73-
-i "pandas.NA SA01" \
7473
-i "pandas.Period.freq GL08" \
7574
-i "pandas.Period.ordinal GL08" \
76-
-i "pandas.Period.to_timestamp SA01" \
77-
-i "pandas.PeriodDtype.freq SA01" \
7875
-i "pandas.RangeIndex.from_range PR01,SA01" \
79-
-i "pandas.RangeIndex.start SA01" \
8076
-i "pandas.RangeIndex.step SA01" \
81-
-i "pandas.RangeIndex.stop SA01" \
8277
-i "pandas.Series.cat.add_categories PR01,PR02" \
8378
-i "pandas.Series.cat.as_ordered PR01" \
8479
-i "pandas.Series.cat.as_unordered PR01" \
@@ -93,10 +88,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
9388
-i "pandas.Series.dt.floor PR01,PR02" \
9489
-i "pandas.Series.dt.freq GL08" \
9590
-i "pandas.Series.dt.month_name PR01,PR02" \
96-
-i "pandas.Series.dt.nanoseconds SA01" \
9791
-i "pandas.Series.dt.normalize PR01" \
9892
-i "pandas.Series.dt.round PR01,PR02" \
99-
-i "pandas.Series.dt.seconds SA01" \
10093
-i "pandas.Series.dt.strftime PR01,PR02" \
10194
-i "pandas.Series.dt.to_period PR01,PR02" \
10295
-i "pandas.Series.dt.total_seconds PR01" \
@@ -108,39 +101,24 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
108101
-i "pandas.Series.sparse.from_coo PR07,SA01" \
109102
-i "pandas.Series.sparse.npoints SA01" \
110103
-i "pandas.Series.sparse.sp_values SA01" \
111-
-i "pandas.Timedelta.components SA01" \
112104
-i "pandas.Timedelta.max PR02" \
113105
-i "pandas.Timedelta.min PR02" \
114106
-i "pandas.Timedelta.resolution PR02" \
115107
-i "pandas.Timedelta.to_timedelta64 SA01" \
116-
-i "pandas.Timedelta.total_seconds SA01" \
117-
-i "pandas.Timedelta.view SA01" \
118-
-i "pandas.TimedeltaIndex.nanoseconds SA01" \
119-
-i "pandas.TimedeltaIndex.seconds SA01" \
120108
-i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \
121109
-i "pandas.Timestamp.max PR02" \
122110
-i "pandas.Timestamp.min PR02" \
123111
-i "pandas.Timestamp.nanosecond GL08" \
124112
-i "pandas.Timestamp.resolution PR02" \
125113
-i "pandas.Timestamp.tzinfo GL08" \
126114
-i "pandas.Timestamp.year GL08" \
127-
-i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \
128-
-i "pandas.api.types.is_bool PR01,SA01" \
129-
-i "pandas.api.types.is_categorical_dtype SA01" \
130-
-i "pandas.api.types.is_complex PR01,SA01" \
131-
-i "pandas.api.types.is_complex_dtype SA01" \
132-
-i "pandas.api.types.is_datetime64_dtype SA01" \
133-
-i "pandas.api.types.is_datetime64_ns_dtype SA01" \
134-
-i "pandas.api.types.is_datetime64tz_dtype SA01" \
135115
-i "pandas.api.types.is_dict_like PR07,SA01" \
136-
-i "pandas.api.types.is_extension_array_dtype SA01" \
137116
-i "pandas.api.types.is_file_like PR07,SA01" \
138117
-i "pandas.api.types.is_float PR01,SA01" \
139118
-i "pandas.api.types.is_float_dtype SA01" \
140119
-i "pandas.api.types.is_hashable PR01,RT03,SA01" \
141120
-i "pandas.api.types.is_int64_dtype SA01" \
142121
-i "pandas.api.types.is_integer PR01,SA01" \
143-
-i "pandas.api.types.is_integer_dtype SA01" \
144122
-i "pandas.api.types.is_interval_dtype SA01" \
145123
-i "pandas.api.types.is_iterator PR07,SA01" \
146124
-i "pandas.api.types.is_list_like SA01" \
@@ -152,7 +130,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
152130
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
153131
-i "pandas.arrays.BooleanArray SA01" \
154132
-i "pandas.arrays.DatetimeArray SA01" \
155-
-i "pandas.arrays.FloatingArray SA01" \
156133
-i "pandas.arrays.IntegerArray SA01" \
157134
-i "pandas.arrays.IntervalArray.left SA01" \
158135
-i "pandas.arrays.IntervalArray.length SA01" \
@@ -165,35 +142,27 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
165142
-i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \
166143
-i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \
167144
-i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
168-
-i "pandas.core.groupby.DataFrameGroupBy.filter SA01" \
169145
-i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
170146
-i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \
171147
-i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \
172148
-i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \
173-
-i "pandas.core.groupby.DataFrameGroupBy.max SA01" \
174-
-i "pandas.core.groupby.DataFrameGroupBy.min SA01" \
175149
-i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \
176150
-i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
177151
-i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \
178152
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
179153
-i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \
180-
-i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \
181154
-i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \
182155
-i "pandas.core.groupby.SeriesGroupBy.agg RT03" \
183156
-i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \
184-
-i "pandas.core.groupby.SeriesGroupBy.filter PR01,SA01" \
185157
-i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
186158
-i "pandas.core.groupby.SeriesGroupBy.groups SA01" \
187159
-i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
188160
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \
189161
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \
190-
-i "pandas.core.groupby.SeriesGroupBy.max SA01" \
191-
-i "pandas.core.groupby.SeriesGroupBy.min SA01" \
192162
-i "pandas.core.groupby.SeriesGroupBy.nth PR02" \
193163
-i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \
194164
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
195165
-i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
196-
-i "pandas.core.groupby.SeriesGroupBy.sum SA01" \
197166
-i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \
198167
-i "pandas.core.resample.Resampler.ffill RT03" \
199168
-i "pandas.core.resample.Resampler.get_group RT03,SA01" \
@@ -222,7 +191,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
222191
-i "pandas.errors.IntCastingNaNError SA01" \
223192
-i "pandas.errors.InvalidIndexError SA01" \
224193
-i "pandas.errors.InvalidVersion SA01" \
225-
-i "pandas.errors.MergeError SA01" \
226194
-i "pandas.errors.NullFrequencyError SA01" \
227195
-i "pandas.errors.NumExprClobberingError SA01" \
228196
-i "pandas.errors.NumbaUtilError SA01" \

doc/source/whatsnew/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Version 2.2
3232
.. toctree::
3333
:maxdepth: 2
3434

35+
v2.2.3
3536
v2.2.2
3637
v2.2.1
3738
v2.2.0

doc/source/whatsnew/v2.2.3.rst

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
.. _whatsnew_223:
2+
3+
What's new in 2.2.3 (September XX, 2024)
4+
----------------------------------------
5+
6+
These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog
7+
including other versions of pandas.
8+
9+
{{ header }}
10+
11+
.. ---------------------------------------------------------------------------
12+
.. _whatsnew_223.regressions:
13+
14+
Fixed regressions
15+
~~~~~~~~~~~~~~~~~
16+
-
17+
18+
.. ---------------------------------------------------------------------------
19+
.. _whatsnew_223.bug_fixes:
20+
21+
Bug fixes
22+
~~~~~~~~~
23+
-
24+
25+
.. ---------------------------------------------------------------------------
26+
.. _whatsnew_223.other:
27+
28+
Other
29+
~~~~~
30+
-
31+
32+
.. ---------------------------------------------------------------------------
33+
.. _whatsnew_223.contributors:
34+
35+
Contributors
36+
~~~~~~~~~~~~

doc/source/whatsnew/v2.3.0.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,11 @@ Conversion
102102

103103
Strings
104104
^^^^^^^
105+
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
105106
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
107+
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
106108
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
107-
109+
-
108110

109111
Interval
110112
^^^^^^^^

doc/source/whatsnew/v3.0.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ Other enhancements
5555
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
5757
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
58+
- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
5859
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
5960
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
6061
- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
@@ -626,6 +627,7 @@ I/O
626627
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
627628
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
628629
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
630+
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
629631
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
630632
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
631633
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
@@ -668,6 +670,7 @@ Reshaping
668670
- Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
669671
- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
670672
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
673+
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
671674
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
672675

673676
Sparse

pandas/_libs/lib.pyx

+37-6
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,8 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool:
600600
if not array_equivalent(x, y):
601601
return False
602602

603+
elif PyArray_Check(x) or PyArray_Check(y):
604+
return False
603605
elif (x is C_NA) ^ (y is C_NA):
604606
return False
605607
elif not (
@@ -733,7 +735,9 @@ cpdef ndarray[object] ensure_string_array(
733735
convert_na_value : bool, default True
734736
If False, existing na values will be used unchanged in the new array.
735737
copy : bool, default True
736-
Whether to ensure that a new array is returned.
738+
Whether to ensure that a new array is returned. When True, a new array
739+
is always returned. When False, a new array is only returned when needed
740+
to avoid mutating the input array.
737741
skipna : bool, default True
738742
Whether or not to coerce nulls to their stringified form
739743
(e.g. if False, NaN becomes 'nan').
@@ -762,11 +766,15 @@ cpdef ndarray[object] ensure_string_array(
762766

763767
result = np.asarray(arr, dtype="object")
764768

765-
if copy and (result is arr or np.shares_memory(arr, result)):
766-
# GH#54654
767-
result = result.copy()
768-
elif not copy and result is arr:
769-
already_copied = False
769+
if result is arr or np.may_share_memory(arr, result):
770+
# if np.asarray(..) did not make a copy of the input arr, we still need
771+
# to do that to avoid mutating the input array
772+
# GH#54654: share_memory check is needed for rare cases where np.asarray
773+
# returns a new object without making a copy of the actual data
774+
if copy:
775+
result = result.copy()
776+
else:
777+
already_copied = False
770778
elif not copy and not result.flags.writeable:
771779
# Weird edge case where result is a view
772780
already_copied = False
@@ -1123,10 +1131,21 @@ def is_bool(obj: object) -> bool:
11231131
"""
11241132
Return True if given object is boolean.
11251133

1134+
Parameters
1135+
----------
1136+
obj : object
1137+
Object to check.
1138+
11261139
Returns
11271140
-------
11281141
bool
11291142

1143+
See Also
1144+
--------
1145+
api.types.is_scalar : Check if the input is a scalar.
1146+
api.types.is_integer : Check if the input is an integer.
1147+
api.types.is_float : Check if the input is a float.
1148+
11301149
Examples
11311150
--------
11321151
>>> pd.api.types.is_bool(True)
@@ -1142,10 +1161,22 @@ def is_complex(obj: object) -> bool:
11421161
"""
11431162
Return True if given object is complex.
11441163

1164+
Parameters
1165+
----------
1166+
obj : object
1167+
Object to check.
1168+
11451169
Returns
11461170
-------
11471171
bool
11481172

1173+
See Also
1174+
--------
1175+
api.types.is_complex_dtype : Check whether the provided array or
1176+
dtype is of a complex dtype.
1177+
api.types.is_number : Check if the object is a number.
1178+
api.types.is_integer : Return True if given object is integer.
1179+
11491180
Examples
11501181
--------
11511182
>>> pd.api.types.is_complex(1 + 1j)

pandas/_libs/missing.pyx

+8
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,14 @@ class NAType(C_NAType):
347347
The NA singleton is a missing value indicator defined by pandas. It is
348348
used in certain new extension dtypes (currently the "string" dtype).
349349
350+
See Also
351+
--------
352+
numpy.nan : Floating point representation of Not a Number (NaN) for numerical data.
353+
isna : Detect missing values for an array-like object.
354+
notna : Detect non-missing values for an array-like object.
355+
DataFrame.fillna : Fill missing values in a DataFrame.
356+
Series.fillna : Fill missing values in a Series.
357+
350358
Examples
351359
--------
352360
>>> pd.NA

pandas/_libs/tslibs/nattype.pyx

+8
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,14 @@ class NaTType(_NaT):
493493
"""
494494
Total seconds in the duration.
495495
496+
This method calculates the total duration in seconds by combining
497+
the days, seconds, and microseconds of the `Timedelta` object.
498+
499+
See Also
500+
--------
501+
to_timedelta : Convert argument to timedelta.
502+
Timedelta : Represents a duration, the difference between two dates or times.
503+
496504
Examples
497505
--------
498506
>>> td = pd.Timedelta('1min')

pandas/_libs/tslibs/period.pyx

+6
Original file line numberDiff line numberDiff line change
@@ -2001,6 +2001,12 @@ cdef class _Period(PeriodMixin):
20012001
-------
20022002
Timestamp
20032003

2004+
See Also
2005+
--------
2006+
Timestamp : A class representing a single point in time.
2007+
Period : Represents a span of time with a fixed frequency.
2008+
PeriodIndex.to_timestamp : Convert a `PeriodIndex` to a `DatetimeIndex`.
2009+
20042010
Examples
20052011
--------
20062012
>>> period = pd.Period('2023-1-1', freq='D')

pandas/_libs/tslibs/timedeltas.pyx

+34
Original file line numberDiff line numberDiff line change
@@ -1189,6 +1189,14 @@ cdef class _Timedelta(timedelta):
11891189
"""
11901190
Total seconds in the duration.
11911191

1192+
This method calculates the total duration in seconds by combining
1193+
the days, seconds, and microseconds of the `Timedelta` object.
1194+
1195+
See Also
1196+
--------
1197+
to_timedelta : Convert argument to timedelta.
1198+
Timedelta : Represents a duration, the difference between two dates or times.
1199+
11921200
Examples
11931201
--------
11941202
>>> td = pd.Timedelta('1min')
@@ -1458,11 +1466,26 @@ cdef class _Timedelta(timedelta):
14581466
"""
14591467
Array view compatibility.
14601468
1469+
This method allows you to reinterpret the underlying data of a Timedelta
1470+
object as a different dtype. The `view` method provides a way to reinterpret
1471+
the internal representation of the `Timedelta` object without modifying its
1472+
data. This is particularly useful when you need to work with the underlying
1473+
data directly, such as for performance optimizations or interfacing with
1474+
low-level APIs. The returned value is typically the number of nanoseconds
1475+
in the duration, represented as an integer or another specified dtype.
1476+
14611477
Parameters
14621478
----------
14631479
dtype : str or dtype
14641480
The dtype to view the underlying data as.
14651481
1482+
See Also
1483+
--------
1484+
numpy.ndarray.view : Returns a view of an array with the same data.
1485+
Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64.
1486+
Timedelta.total_seconds : Returns the total duration of the Timedelta
1487+
object in seconds.
1488+
14661489
Examples
14671490
--------
14681491
>>> td = pd.Timedelta('3D')
@@ -1478,6 +1501,17 @@ cdef class _Timedelta(timedelta):
14781501
"""
14791502
Return a components namedtuple-like.
14801503
1504+
Each component represents a different time unit, allowing you to access the
1505+
breakdown of the total duration in terms of days, hours, minutes, seconds,
1506+
milliseconds, microseconds, and nanoseconds.
1507+
1508+
See Also
1509+
--------
1510+
Timedelta.total_seconds : Returns the total duration of the Timedelta in
1511+
seconds.
1512+
to_timedelta : Convert argument to Timedelta.
1513+
Timedelta : Represents a duration, the difference between two dates or times.
1514+
14811515
Examples
14821516
--------
14831517
>>> td = pd.Timedelta('2 day 4 min 3 us 42 ns')

0 commit comments

Comments
 (0)