diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 97e07a76b9149..7a55dd69ba7b7 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -219,7 +219,7 @@ def box_expected(expected, box_cls, transpose=True): else: expected = pd.array(expected) elif box_cls is Index: - expected = Index(expected) + expected = Index._with_infer(expected) elif box_cls is Series: expected = Series(expected) elif box_cls is DataFrame: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b46679c2fca18..1686d69bfcb61 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2031,7 +2031,9 @@ def _validate_listlike(self, value): from pandas import Index # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914 - to_add = Index(value, tupleize_cols=False).difference(self.categories) + to_add = Index._with_infer(value, tupleize_cols=False).difference( + self.categories + ) # no assignments of values not in categories, but it's always ok to set # something to np.nan @@ -2741,6 +2743,7 @@ def factorize_from_iterable(values) -> tuple[np.ndarray, Index]: # as values but its codes are by def [0, ..., len(n_categories) - 1] cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) cat = Categorical.from_codes(cat_codes, dtype=values.dtype) + categories = CategoricalIndex(cat) codes = values.codes else: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index dd45029336f63..41998218acd7d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -16,7 +16,10 @@ from pandas._config import get_option -from pandas._libs import NaT +from pandas._libs import ( + NaT, + lib, +) from pandas._libs.interval import ( VALID_CLOSED, Interval, @@ -225,6 +228,9 @@ def __new__( left, right, infer_closed = intervals_to_interval_bounds( data, validate_closed=closed is None ) + if left.dtype == object: + left = lib.maybe_convert_objects(left) + right = lib.maybe_convert_objects(right) closed = closed or infer_closed return cls._simple_new( diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 51b0b746cadf9..a214371bdf26e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -529,7 +529,7 @@ def validate_categories(categories, fastpath: bool = False) -> Index: f"Parameter 'categories' must be list-like, was {repr(categories)}" ) elif not isinstance(categories, ABCIndex): - categories = Index(categories, tupleize_cols=False) + categories = Index._with_infer(categories, tupleize_cols=False) if not fastpath: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 06a70b5786a4d..75a5f37003500 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -455,7 +455,7 @@ def _get_index() -> Index: if self.grouper.nkeys > 1: index = MultiIndex.from_tuples(keys, names=self.grouper.names) else: - index = Index(keys, name=self.grouper.names[0]) + index = Index._with_infer(keys, name=self.grouper.names[0]) return index if isinstance(values[0], dict): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 76815d780a1ad..260416576d79e 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -646,7 +646,7 @@ def group_index(self) -> Index: return self._group_index uniques = self._codes_and_uniques[1] - return Index(uniques, name=self.name) + return Index._with_infer(uniques, name=self.name) @cache_readonly def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 87c50e94deb34..5f0d8290b2ffa 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -471,7 +471,9 @@ def __new__( arr = com.asarray_tuplesafe(data, dtype=np.dtype("object")) if dtype is None: - arr = _maybe_cast_data_without_dtype(arr) + arr = _maybe_cast_data_without_dtype( + arr, cast_numeric_deprecated=True + ) dtype = arr.dtype if kwargs: @@ -504,6 +506,15 @@ def __new__( # other iterable of some kind subarr = com.asarray_tuplesafe(data, dtype=np.dtype("object")) + if dtype is None: + # with e.g. a list [1, 2, 3] casting to numeric is _not_ deprecated + # error: Incompatible types in assignment (expression has type + # "Union[ExtensionArray, ndarray[Any, Any]]", variable has type + # "ndarray[Any, Any]") + subarr = _maybe_cast_data_without_dtype( # type: ignore[assignment] + subarr, cast_numeric_deprecated=False + ) + dtype = subarr.dtype return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) @classmethod @@ -637,6 +648,26 @@ def _simple_new(cls: type[_IndexT], values, name: Hashable = None) -> _IndexT: return result + @classmethod + def _with_infer(cls, *args, **kwargs): + """ + Constructor that uses the 1.0.x behavior inferring numeric dtypes + for ndarray[object] inputs. + """ + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", ".*the Index constructor", FutureWarning) + result = cls(*args, **kwargs) + + if result.dtype == object and not result._is_multi: + # error: Argument 1 to "maybe_convert_objects" has incompatible type + # "Union[ExtensionArray, ndarray[Any, Any]]"; expected + # "ndarray[Any, Any]" + values = lib.maybe_convert_objects(result._values) # type: ignore[arg-type] + if values.dtype.kind in ["i", "u", "f"]: + return Index(values, name=result.name) + + return result + @cache_readonly def _constructor(self: _IndexT) -> type[_IndexT]: return type(self) @@ -2609,7 +2640,7 @@ def fillna(self, value=None, downcast=None): if downcast is None: # no need to care metadata other than name # because it can't have freq if - return Index(result, name=self.name) + return Index._with_infer(result, name=self.name) return self._view() def dropna(self: _IndexT, how: str_t = "any") -> _IndexT: @@ -4000,7 +4031,7 @@ def _reindex_non_unique( if isinstance(self, ABCMultiIndex): new_index = type(self).from_tuples(new_labels, names=self.names) else: - new_index = Index(new_labels, name=self.name) + new_index = Index._with_infer(new_labels, name=self.name) return new_index, indexer, new_indexer # -------------------------------------------------------------------- @@ -4441,9 +4472,12 @@ def _wrap_joined_index(self: _IndexT, joined: ArrayLike, other: _IndexT) -> _Ind if isinstance(self, ABCMultiIndex): name = self.names if self.names == other.names else None + # error: Incompatible return value type (got "MultiIndex", + # expected "_IndexT") + return self._constructor(joined, name=name) # type: ignore[return-value] else: name = get_op_result_name(self, other) - return self._constructor(joined, name=name) + return self._constructor._with_infer(joined, name=name) # -------------------------------------------------------------------- # Uncategorized Methods @@ -4796,7 +4830,7 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index: to_concat_vals = [x._values for x in to_concat] result = concat_compat(to_concat_vals) - return Index(result, name=name) + return Index._with_infer(result, name=name) def putmask(self, mask, value) -> Index: """ @@ -5743,7 +5777,7 @@ def map(self, mapper, na_action=None): ): return self._constructor(new_values, **attributes) - return Index(new_values, **attributes) + return Index._with_infer(new_values, **attributes) # TODO: De-duplicate with map, xref GH#32349 @final @@ -6219,7 +6253,7 @@ def insert(self, loc: int, item) -> Index: # Use Index constructor to ensure we get tuples cast correctly. item = Index([item], dtype=self.dtype)._values idx = np.concatenate((arr[:loc], item, arr[loc:])) - return Index(idx, name=self.name) + return Index._with_infer(idx, name=self.name) def drop(self, labels, errors: str_t = "raise") -> Index: """ @@ -6304,8 +6338,8 @@ def _arith_method(self, other, op): result = op(Series(self), other) if isinstance(result, tuple): - return (Index(result[0]), Index(result[1])) - return Index(result) + return (Index._with_infer(result[0]), Index(result[1])) + return Index._with_infer(result) @final def _unary_method(self, op): @@ -6628,7 +6662,7 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind if isinstance(index_like, ABCSeries): name = index_like.name - return Index(index_like, name=name, copy=copy) + return Index._with_infer(index_like, name=name, copy=copy) if is_iterator(index_like): index_like = list(index_like) @@ -6644,10 +6678,9 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind return MultiIndex.from_arrays(index_like) else: - return Index(index_like, copy=copy, tupleize_cols=False) + return Index._with_infer(index_like, copy=copy, tupleize_cols=False) else: - - return Index(index_like, copy=copy) + return Index._with_infer(index_like, copy=copy) def ensure_has_len(seq): @@ -6708,7 +6741,17 @@ def maybe_extract_name(name, obj, cls) -> Hashable: return name -def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike: +_cast_depr_msg = ( + "In a future version, passing an object-dtype arraylike to pd.Index will " + "not infer numeric values to numeric dtype (matching the Series behavior). " + "To retain the old behavior, explicitly pass the desired dtype or use the " + "desired Index subclass" +) + + +def _maybe_cast_data_without_dtype( + subarr: np.ndarray, cast_numeric_deprecated: bool = True +) -> ArrayLike: """ If we have an arraylike input but no passed dtype, try to infer a supported dtype. @@ -6716,6 +6759,8 @@ def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike: Parameters ---------- subarr : np.ndarray[object] + cast_numeric_deprecated : bool, default True + Whether to issue a FutureWarning when inferring numeric dtypes. Returns ------- @@ -6730,6 +6775,17 @@ def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike: convert_interval=True, dtype_if_all_nat=np.dtype("datetime64[ns]"), ) + if result.dtype.kind in ["i", "u", "f"]: + if not cast_numeric_deprecated: + # i.e. we started with a list, not an ndarray[object] + return result + + warnings.warn( + "In a future version, the Index constructor will not infer numeric " + "dtypes when passed object-dtype sequences (matching Series behavior)", + FutureWarning, + stacklevel=3, + ) if result.dtype.kind in ["b", "c"]: return subarr result = ensure_wrapped_if_datetimelike(result) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e5aa8e95e23de..5810590ac640b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2150,7 +2150,7 @@ def append(self, other): try: return MultiIndex.from_tuples(new_tuples, names=self.names) except (TypeError, IndexError): - return Index(new_tuples) + return Index._with_infer(new_tuples) def argsort(self, *args, **kwargs) -> np.ndarray: return self._values.argsort(*args, **kwargs) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index ef01602be7654..55a55d0111397 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -322,7 +322,7 @@ def cons_row(x): out = out.get_level_values(0) return out else: - return Index(result, name=name) + return Index._with_infer(result, name=name) else: index = self._orig.index # This is a mess. diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 26349a3b2c6c1..8eac5f76fd455 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -226,7 +226,7 @@ def _box_as_indexlike( if is_datetime64_dtype(dt_array): tz = "utc" if utc else None return DatetimeIndex(dt_array, tz=tz, name=name) - return Index(dt_array, name=name) + return Index(dt_array, name=name, dtype=dt_array.dtype) def _convert_and_box_cache( @@ -517,7 +517,7 @@ def _to_datetime_with_unit(arg, unit, name, tz, errors: str) -> Index: """ to_datetime specalized to the case where a 'unit' is passed. """ - arg = getattr(arg, "_values", arg) + arg = getattr(arg, "_values", arg) # TODO: extract_array # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime # because it expects an ndarray argument @@ -529,7 +529,7 @@ def _to_datetime_with_unit(arg, unit, name, tz, errors: str) -> Index: if errors == "ignore": # Index constructor _may_ infer to DatetimeIndex - result = Index(arr, name=name) + result = Index._with_infer(arr, name=name) else: result = DatetimeIndex(arr, name=name) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index aa8ec157265ce..02899bac14bb2 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -329,7 +329,9 @@ def _hash_ndarray( ) codes, categories = factorize(vals, sort=False) - cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) + cat = Categorical( + codes, Index._with_infer(categories), ordered=False, fastpath=True + ) return _hash_categorical(cat, encoding, hash_key) try: diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index e3f59205aa07c..f8b1ea2ebde23 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -88,7 +88,7 @@ def test_astype_index(all_data, dropna): other = all_data dtype = all_data.dtype - idx = pd.Index(np.array(other)) + idx = pd.Index._with_infer(np.array(other)) assert isinstance(idx, ABCIndex) result = idx.astype(dtype) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 4c8dc6ca1ad9c..db0190d488d42 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -25,7 +25,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): _, uniques = pd.factorize(data_for_grouping, sort=True) if as_index: - index = pd.Index(uniques, name="B") + index = pd.Index._with_infer(uniques, name="B") expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A") self.assert_series_equal(result, expected) else: @@ -53,7 +53,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping): result = df.groupby("B", sort=False).A.mean() _, index = pd.factorize(data_for_grouping, sort=False) - index = pd.Index(index, name="B") + index = pd.Index._with_infer(index, name="B") expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A") self.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 9d778cdee6a5b..5f769946335ed 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1067,7 +1067,7 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value): result = getattr(df, op)() expected = DataFrame( {"value": expected_value}, - index=Index([100, 200], dtype="object", name="ID"), + index=Index([100, 200], name="ID"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 84ec43b5d38d7..3ae11847cc06b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1126,7 +1126,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): # https://github.com/pandas-dev/pandas/issues/32219 output = 0.5 if function == "var" else 1.5 arr = np.array([output] * 3, dtype=float) - idx = Index([1, 2, 3], dtype=object, name="a") + idx = Index([1, 2, 3], name="a") expected = DataFrame({"b": arr}, index=idx).astype("Float64") groups = DataFrame(values, dtype="Int64").groupby("a") @@ -1146,7 +1146,7 @@ def test_groupby_sum_below_mincount_nullable_integer(): # https://github.com/pandas-dev/pandas/issues/32861 df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") grouped = df.groupby("a") - idx = Index([0, 1, 2], dtype=object, name="a") + idx = Index([0, 1, 2], name="a") result = grouped["b"].sum(min_count=2) expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index e7dd547b3e73e..2e948c5aa0211 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -136,9 +136,10 @@ def test_constructor_coerce(self, mixed_index, float_index): self.check_coerce(mixed_index, Index([1.5, 2, 3, 4, 5])) self.check_coerce(float_index, Index(np.arange(5) * 2.5)) - self.check_coerce( - float_index, Index(np.array(np.arange(5) * 2.5, dtype=object)) - ) + + with tm.assert_produces_warning(FutureWarning, match="will not infer"): + result = Index(np.array(np.arange(5) * 2.5, dtype=object)) + self.check_coerce(float_index, result.astype("float64")) def test_constructor_explicit(self, mixed_index, float_index): @@ -478,11 +479,17 @@ def test_constructor_corner(self, dtype): index_cls = self._index_cls arr = np.array([1, 2, 3, 4], dtype=object) + index = index_cls(arr, dtype=dtype) assert index.values.dtype == index.dtype if dtype == np.int64: + + msg = "will not infer" + with tm.assert_produces_warning(FutureWarning, match=msg): + without_dtype = Index(arr) + exact = True if index_cls is Int64Index else "equiv" - tm.assert_index_equal(index, Index(arr), exact=exact) + tm.assert_index_equal(index, without_dtype, exact=exact) # preventing casting arr = np.array([1, "2", 3, "4"], dtype=object) diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py index e306b6e67cf7f..8fb1d7a210cee 100644 --- a/pandas/tests/indexes/ranges/test_constructors.py +++ b/pandas/tests/indexes/ranges/test_constructors.py @@ -148,7 +148,8 @@ def test_constructor_corner(self): arr = np.array([1, 2, 3, 4], dtype=object) index = RangeIndex(1, 5) assert index.values.dtype == np.int64 - tm.assert_index_equal(index, Index(arr)) + with tm.assert_produces_warning(FutureWarning, match="will not infer"): + tm.assert_index_equal(index, Index(arr).astype("int64")) # non-int raise Exception with pytest.raises(TypeError, match=r"Wrong type \"): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3ba5835331fe5..bc02bc4e3a33f 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -20,6 +20,7 @@ DataFrame, Series, ) +from pandas.core.indexes.api import ensure_index from pandas.io.parsers import read_csv from pandas.io.stata import ( @@ -1174,7 +1175,7 @@ def _convert_categorical(from_frame: DataFrame) -> DataFrame: if is_categorical_dtype(ser.dtype): cat = ser._values.remove_unused_categories() if cat.categories.dtype == object: - categories = pd.Index(cat.categories._values) + categories = ensure_index(cat.categories._values) cat = cat.set_categories(categories) from_frame[col] = cat return from_frame diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 82e6c4daf9515..4f69a7f590319 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -329,6 +329,7 @@ def test_groupby_resample_interpolate(): ], names=["volume", "week_starting"], ) + expected = DataFrame( data={ "price": [ diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 0b3689afac764..052aef4ac1bab 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -93,7 +93,7 @@ def test_equals_matching_nas(): left = Series([np.float64("NaN")], dtype=object) right = Series([np.float64("NaN")], dtype=object) assert left.equals(right) - assert Index(left).equals(Index(right)) + assert Index(left, dtype=left.dtype).equals(Index(right, dtype=right.dtype)) assert left.array.equals(right.array) @@ -123,5 +123,5 @@ def test_equals_none_vs_nan(): ser2 = Series([1, np.nan], dtype=object) assert ser.equals(ser2) - assert Index(ser).equals(Index(ser2)) + assert Index(ser, dtype=ser.dtype).equals(Index(ser2, dtype=ser2.dtype)) assert ser.array.equals(ser2.array) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index dbaf723675efd..ec060aa91e383 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -267,6 +267,7 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx2) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:passing object-dtype arraylike:FutureWarning") def test_reversed_xor_with_index_returns_index(self): # GH#22092, GH#19792 ser = Series([True, True, False, False]) @@ -281,7 +282,9 @@ def test_reversed_xor_with_index_returns_index(self): tm.assert_index_equal(result, expected) expected = Index.symmetric_difference(idx2, ser) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): result = idx2 ^ ser tm.assert_index_equal(result, expected)