From 94611584cb2b71d00ea97771fc617ef051eb920e Mon Sep 17 00:00:00 2001 From: Lance <46547065+Condielj@users.noreply.github.com> Date: Wed, 4 May 2022 01:06:43 -0600 Subject: [PATCH 1/8] add validate_bool_kwargs_from_keywords to methods --- frame.py | 11445 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 11445 insertions(+) create mode 100644 frame.py diff --git a/frame.py b/frame.py new file mode 100644 index 0000000000000..cf6ff527dc8d3 --- /dev/null +++ b/frame.py @@ -0,0 +1,11445 @@ +""" +DataFrame +--------- +An efficient 2D container for potentially mixed-type time series or other +labeled data series. + +Similar to its R counterpart, data.frame, except providing automatic data +alignment and a host of useful data manipulation methods having to do with the +labeling information +""" +from __future__ import annotations + +import collections +from collections import abc +import datetime +import functools +from io import StringIO +import itertools +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Hashable, + Iterable, + Iterator, + Literal, + Sequence, + cast, + overload, +) +import warnings + +import numpy as np +import numpy.ma as ma + +from pandas._config import get_option + +from pandas._libs import ( + algos as libalgos, + lib, + properties, +) +from pandas._libs.hashtable import duplicated +from pandas._libs.lib import ( + NoDefault, + no_default, +) +from pandas._typing import ( + AggFuncType, + AnyArrayLike, + ArrayLike, + Axes, + Axis, + ColspaceArgType, + CompressionOptions, + Dtype, + DtypeObj, + FilePath, + FillnaOptions, + FloatFormatType, + FormattersType, + Frequency, + IgnoreRaise, + IndexKeyFunc, + IndexLabel, + Level, + NaPosition, + PythonFuncType, + ReadBuffer, + Renamer, + Scalar, + SortKind, + StorageOptions, + Suffixes, + TimedeltaConvertibleTypes, + TimestampConvertibleTypes, + ValueKeyFunc, + WriteBuffer, + npt, +) +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import function as nv +from pandas.util._decorators import ( + Appender, + Substitution, + deprecate_kwarg, + deprecate_nonkeyword_arguments, + doc, + rewrite_axis_style_signature, +) +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import ( + validate_ascending, + validate_axis_style_args, + validate_bool_kwarg, + validate_percentile, + validate_bool_kwargs_from_keywords, +) + +from pandas.core.dtypes.cast import ( + LossySetitemError, + can_hold_element, + construct_1d_arraylike_from_scalar, + construct_2d_arraylike_from_scalar, + find_common_type, + infer_dtype_from_scalar, + invalidate_string_dtypes, + maybe_box_native, + maybe_downcast_to_dtype, +) +from pandas.core.dtypes.common import ( + ensure_platform_int, + infer_dtype_from_object, + is_1d_only_ea_dtype, + is_bool_dtype, + is_dataclass, + is_datetime64_any_dtype, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_iterator, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, + is_sequence, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.missing import ( + isna, + notna, +) + +from pandas.core import ( + algorithms, + common as com, + nanops, + ops, +) +from pandas.core.accessor import CachedAccessor +from pandas.core.apply import ( + reconstruct_func, + relabel_result, +) +from pandas.core.array_algos.take import take_2d_multi 
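# NOTE: ``validate_bool_kwargs_from_keywords`` (imported above from
# ``pandas.util._validators``) is applied throughout this patch, but its
# definition is not part of this diff.  A minimal sketch of what such a
# decorator could look like, assuming it simply binds the call arguments and
# runs the existing ``validate_bool_kwarg`` check on each named parameter --
# an illustration under those assumptions, not the actual implementation:

import functools
import inspect

from pandas.util._validators import validate_bool_kwarg


def validate_bool_kwargs_from_keywords(*keywords):
    """Reject non-bool values passed for the named boolean parameters."""

    def decorator(func):
        sig = inspect.signature(func)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Bind positional and keyword arguments to parameter names so the
            # check works however the caller spelled the arguments.
            bound = sig.bind(*args, **kwargs)
            bound.apply_defaults()
            for name in keywords:
                # validate_bool_kwarg raises ValueError for non-bool input
                validate_bool_kwarg(bound.arguments[name], name)
            return func(*args, **kwargs)

        return wrapper

    return decorator
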
+from pandas.core.arraylike import OpsMixin +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + PeriodArray, + TimedeltaArray, +) +from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.construction import ( + extract_array, + sanitize_array, + sanitize_masked_array, +) +from pandas.core.generic import NDFrame +from pandas.core.indexers import check_key_length +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + PeriodIndex, + default_index, + ensure_index, + ensure_index_from_sequences, +) +from pandas.core.indexes.multi import ( + MultiIndex, + maybe_droplevels, +) +from pandas.core.indexing import ( + check_bool_indexer, + check_deprecated_indexers, + convert_to_index_sliceable, +) +from pandas.core.internals import ( + ArrayManager, + BlockManager, +) +from pandas.core.internals.construction import ( + arrays_to_mgr, + dataclasses_to_dicts, + dict_to_mgr, + mgr_to_mgr, + ndarray_to_mgr, + nested_data_to_arrays, + rec_array_to_mgr, + reorder_arrays, + to_arrays, + treat_as_nested, +) +from pandas.core.reshape.melt import melt +from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs +from pandas.core.sorting import ( + get_group_index, + lexsort_indexer, + nargsort, +) + +from pandas.io.common import get_handle +from pandas.io.formats import ( + console, + format as fmt, +) +from pandas.io.formats.info import ( + INFO_DOCSTRING, + DataFrameInfo, + frame_sub_kwargs, +) +import pandas.plotting + +if TYPE_CHECKING: + + from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg + from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.core.internals import SingleDataManager + from pandas.core.resample import Resampler + + from pandas.io.formats.style import Styler + +# --------------------------------------------------------------------- +# Docstring templates + +_shared_doc_kwargs = { + "axes": "index, columns", + "klass": "DataFrame", + "axes_single_arg": "{0 or 'index', 1 or 'columns'}", + "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index': apply function to each column. + If 1 or 'columns': apply function to each row.""", + "inplace": """ + inplace : bool, default False + If True, performs operation inplace and returns None.""", + "optional_by": """ + by : str or list of str + Name or list of names to sort by. + + - if `axis` is 0 or `'index'` then `by` may contain index + levels and/or column labels. + - if `axis` is 1 or `'columns'` then `by` may contain column + levels and/or index labels.""", + "optional_labels": """labels : array-like, optional + New labels / index to conform the axis specified by 'axis' to.""", + "optional_axis": """axis : int or str, optional + Axis to target. Can be either the axis name ('index', 'columns') + or number (0, 1).""", + "replace_iloc": """ + This differs from updating with ``.loc`` or ``.iloc``, which require + you to specify a location to update with some value.""", +} + +_numeric_only_doc = """numeric_only : bool or None, default None + Include only float, int, boolean data. If None, will attempt to use + everything, then use only numeric data +""" + +_merge_doc = """ +Merge DataFrame or named Series objects with a database-style join. + +A named Series object is treated as a DataFrame with a single named column. + +The join is done on columns or indexes. If joining columns on +columns, the DataFrame indexes *will be ignored*. 
Otherwise if joining indexes +on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed. + +.. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + +Parameters +----------%s +right : DataFrame or named Series + Object to merge with. +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + + .. versionadded:: 1.2.0 + +on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. +left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. +right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. +left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. +right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. +sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). +suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. +copy : bool, default True + If False, avoid copy if possible. +indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + +validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. 
+ * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + +Returns +------- +DataFrame + A DataFrame of the two merged objects. + +See Also +-------- +merge_ordered : Merge with optional filling/interpolation. +merge_asof : Merge on nearest keys. +DataFrame.join : Similar method using indices. + +Notes +----- +Support for specifying index levels as the `on`, `left_on`, and +`right_on` parameters was added in version 0.23.0 +Support for merging named Series objects was added in version 0.24.0 + +Examples +-------- +>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [1, 2, 3, 5]}) +>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [5, 6, 7, 8]}) +>>> df1 + lkey value +0 foo 1 +1 bar 2 +2 baz 3 +3 foo 5 +>>> df2 + rkey value +0 foo 5 +1 bar 6 +2 baz 7 +3 foo 8 + +Merge df1 and df2 on the lkey and rkey columns. The value columns have +the default suffixes, _x and _y, appended. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey') + lkey value_x rkey value_y +0 foo 1 foo 5 +1 foo 1 foo 8 +2 foo 5 foo 5 +3 foo 5 foo 8 +4 bar 2 bar 6 +5 baz 3 baz 7 + +Merge DataFrames df1 and df2 with specified left and right suffixes +appended to any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', +... suffixes=('_left', '_right')) + lkey value_left rkey value_right +0 foo 1 foo 5 +1 foo 1 foo 8 +2 foo 5 foo 5 +3 foo 5 foo 8 +4 bar 2 bar 6 +5 baz 3 baz 7 + +Merge DataFrames df1 and df2, but raise an exception if the DataFrames have +any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)) +Traceback (most recent call last): +... +ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 + a b +0 foo 1 +1 bar 2 +>>> df2 + a c +0 foo 3 +1 baz 4 + +>>> df1.merge(df2, how='inner', on='a') + a b c +0 foo 1 3 + +>>> df1.merge(df2, how='left', on='a') + a b c +0 foo 1 3.0 +1 bar 2 NaN + +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) +>>> df1 + left +0 foo +1 bar +>>> df2 + right +0 7 +1 8 + +>>> df1.merge(df2, how='cross') + left right +0 foo 7 +1 foo 8 +2 bar 7 +3 bar 8 +""" + + +# ----------------------------------------------------------------------- +# DataFrame class + + +class DataFrame(NDFrame, OpsMixin): + """ + Two-dimensional, size-mutable, potentially heterogeneous tabular data. + + Data structure also contains labeled axes (rows and columns). + Arithmetic operations align on both row and column labels. Can be + thought of as a dict-like container for Series objects. The primary + pandas data structure. + + Parameters + ---------- + data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame + Dict can contain Series, arrays, constants, dataclass or list-like objects. If + data is a dict, column order follows insertion-order. If a dict contains Series + which have an index defined, it is aligned by its index. + + .. versionchanged:: 0.25.0 + If data is a list of dicts, column order follows insertion-order. + + index : Index or array-like + Index to use for resulting frame. Will default to RangeIndex if + no indexing information part of input data and no index provided. 
+ columns : Index or array-like + Column labels to use for resulting frame when data does not have them, + defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels, + will perform column selection instead. + dtype : dtype, default None + Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + For dict data, the default of None behaves like ``copy=True``. For DataFrame + or 2d ndarray input, the default of None behaves like ``copy=False``. + If data is a dict containing one or more Series (possibly of different dtypes), + ``copy=False`` will ensure that these inputs are not copied. + + .. versionchanged:: 1.3.0 + + See Also + -------- + DataFrame.from_records : Constructor from tuples, also record arrays. + DataFrame.from_dict : From dicts of Series, arrays, or dicts. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_table : Read general delimited file into DataFrame. + read_clipboard : Read text from clipboard into DataFrame. + + Notes + ----- + Please reference the :ref:`User Guide ` for more information. + + Examples + -------- + Constructing DataFrame from a dictionary. + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + Notice that the inferred dtype is int64. + + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + To enforce a single dtype: + + >>> df = pd.DataFrame(data=d, dtype=np.int8) + >>> df.dtypes + col1 int8 + col2 int8 + dtype: object + + Constructing DataFrame from a dictionary including Series: + + >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])} + >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) + col1 col2 + 0 0 NaN + 1 1 NaN + 2 2 2.0 + 3 3 3.0 + + Constructing DataFrame from numpy ndarray: + + >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + ... columns=['a', 'b', 'c']) + >>> df2 + a b c + 0 1 2 3 + 1 4 5 6 + 2 7 8 9 + + Constructing DataFrame from a numpy ndarray that has labeled columns: + + >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], + ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]) + >>> df3 = pd.DataFrame(data, columns=['c', 'a']) + ... 
+ >>> df3 + c a + 0 3 1 + 1 6 4 + 2 9 7 + + Constructing DataFrame from dataclass: + + >>> from dataclasses import make_dataclass + >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) + >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) + x y + 0 0 0 + 1 0 3 + 2 2 3 + """ + + _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set + _typ = "dataframe" + _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) + _accessors: set[str] = {"sparse"} + _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([]) + _mgr: BlockManager | ArrayManager + + @property + def _constructor(self) -> Callable[..., DataFrame]: + return DataFrame + + _constructor_sliced: Callable[..., Series] = Series + + # ---------------------------------------------------------------------- + # Constructors + + def __init__( + self, + data=None, + index: Axes | None = None, + columns: Axes | None = None, + dtype: Dtype | None = None, + copy: bool | None = None, + ) -> None: + + if data is None: + data = {} + if dtype is not None: + dtype = self._validate_dtype(dtype) + + if isinstance(data, DataFrame): + data = data._mgr + + if isinstance(data, (BlockManager, ArrayManager)): + # first check if a Manager is passed without any other arguments + # -> use fastpath (without checking Manager type) + if index is None and columns is None and dtype is None and not copy: + # GH#33357 fastpath + NDFrame.__init__(self, data) + return + + manager = get_option("mode.data_manager") + + if copy is None: + if isinstance(data, dict): + # retain pre-GH#38939 default behavior + copy = True + elif ( + manager == "array" + and isinstance(data, (np.ndarray, ExtensionArray)) + and data.ndim == 2 + ): + # INFO(ArrayManager) by default copy the 2D input array to get + # contiguous 1D arrays + copy = True + else: + copy = False + + if isinstance(data, (BlockManager, ArrayManager)): + mgr = self._init_mgr( + data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy + ) + + elif isinstance(data, dict): + # GH#38939 de facto copy defaults to False only in non-dict cases + mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) + elif isinstance(data, ma.MaskedArray): + import numpy.ma.mrecords as mrecords + + # masked recarray + if isinstance(data, mrecords.MaskedRecords): + mgr = rec_array_to_mgr( + data, + index, + columns, + dtype, + copy, + typ=manager, + ) + warnings.warn( + "Support for MaskedRecords is deprecated and will be " + "removed in a future version. Pass " + "{name: data[name] for name in data.dtype.names} instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + # a masked array + else: + data = sanitize_masked_array(data) + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + typ=manager, + ) + + elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)): + if data.dtype.names: + # i.e. numpy structured array + data = cast(np.ndarray, data) + mgr = rec_array_to_mgr( + data, + index, + columns, + dtype, + copy, + typ=manager, + ) + elif getattr(data, "name", None) is not None: + # i.e. 
Series/Index with non-None name + mgr = dict_to_mgr( + # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no + # attribute "name" + {data.name: data}, # type: ignore[union-attr] + index, + columns, + dtype=dtype, + typ=manager, + ) + else: + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + typ=manager, + ) + + # For data is list-like, or Iterable (will consume into list) + elif is_list_like(data): + if not isinstance(data, (abc.Sequence, ExtensionArray)): + if hasattr(data, "__array__"): + # GH#44616 big perf improvement for e.g. pytorch tensor + data = np.asarray(data) + else: + data = list(data) + if len(data) > 0: + if is_dataclass(data[0]): + data = dataclasses_to_dicts(data) + if not isinstance(data, np.ndarray) and treat_as_nested(data): + # exclude ndarray as we may have cast it a few lines above + if columns is not None: + # error: Argument 1 to "ensure_index" has incompatible type + # "Collection[Any]"; expected "Union[Union[Union[ExtensionArray, + # ndarray], Index, Series], Sequence[Any]]" + columns = ensure_index(columns) # type: ignore[arg-type] + arrays, columns, index = nested_data_to_arrays( + # error: Argument 3 to "nested_data_to_arrays" has incompatible + # type "Optional[Collection[Any]]"; expected "Optional[Index]" + data, + columns, + index, # type: ignore[arg-type] + dtype, + ) + mgr = arrays_to_mgr( + arrays, + columns, + index, + dtype=dtype, + typ=manager, + ) + else: + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + typ=manager, + ) + else: + mgr = dict_to_mgr( + {}, + index, + columns, + dtype=dtype, + typ=manager, + ) + # For data is scalar + else: + if index is None or columns is None: + raise ValueError("DataFrame constructor not properly called!") + + # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; + # expected "Union[Union[Union[ExtensionArray, ndarray], + # Index, Series], Sequence[Any]]" + index = ensure_index(index) # type: ignore[arg-type] + # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; + # expected "Union[Union[Union[ExtensionArray, ndarray], + # Index, Series], Sequence[Any]]" + columns = ensure_index(columns) # type: ignore[arg-type] + + if not dtype: + dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) + + # For data is a scalar extension dtype + if isinstance(dtype, ExtensionDtype): + # TODO(EA2D): special case not needed with 2D EAs + + values = [ + construct_1d_arraylike_from_scalar(data, len(index), dtype) + for _ in range(len(columns)) + ] + mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) + else: + arr2d = construct_2d_arraylike_from_scalar( + data, + len(index), + len(columns), + dtype, + copy, + ) + + mgr = ndarray_to_mgr( + arr2d, + index, + columns, + dtype=arr2d.dtype, + copy=False, + typ=manager, + ) + + # ensure correct Manager type according to settings + mgr = mgr_to_mgr(mgr, typ=manager) + + NDFrame.__init__(self, mgr) + + # ---------------------------------------------------------------------- + @validate_bool_kwargs_from_keywords('nan_as_null', 'allow_copy') + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> DataFrameXchg: + """ + Return the dataframe exchange object implementing the exchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. 
If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame exchange object + The object which consuming library can use to ingress the dataframe. + + Notes + ----- + Details on the exchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ + + from pandas.core.exchange.dataframe import PandasDataFrameXchg + + return PandasDataFrameXchg(self, nan_as_null, allow_copy) + + # ---------------------------------------------------------------------- + + @property + def axes(self) -> list[Index]: + """ + Return a list representing the axes of the DataFrame. + + It has the row axis labels and column axis labels as the only members. + They are returned in that order. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.axes + [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], + dtype='object')] + """ + return [self.index, self.columns] + + @property + def shape(self) -> tuple[int, int]: + """ + Return a tuple representing the dimensionality of the DataFrame. + + See Also + -------- + ndarray.shape : Tuple of array dimensions. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.shape + (2, 2) + + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4], + ... 'col3': [5, 6]}) + >>> df.shape + (2, 3) + """ + return len(self.index), len(self.columns) + + @property + def _is_homogeneous_type(self) -> bool: + """ + Whether all the columns in a DataFrame have the same type. + + Returns + ------- + bool + + See Also + -------- + Index._is_homogeneous_type : Whether the object has a single + dtype. + MultiIndex._is_homogeneous_type : Whether all the levels of a + MultiIndex have the same dtype. + + Examples + -------- + >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type + True + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type + False + + Items with the same type but different sizes are considered + different types. + + >>> DataFrame({ + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type + False + """ + if isinstance(self._mgr, ArrayManager): + return len({arr.dtype for arr in self._mgr.arrays}) == 1 + if self._mgr.any_extension_types: + return len({block.dtype for block in self._mgr.blocks}) == 1 + else: + return not self._is_mixed_type + + @property + def _can_fast_transpose(self) -> bool: + """ + Can we transpose this DataFrame without creating any new array objects. + """ + if isinstance(self._mgr, ArrayManager): + return False + blocks = self._mgr.blocks + if len(blocks) != 1: + return False + + dtype = blocks[0].dtype + # TODO(EA2D) special case would be unnecessary with 2D EAs + return not is_1d_only_ea_dtype(dtype) + + # error: Return type "Union[ndarray, DatetimeArray, TimedeltaArray]" of + # "_values" incompatible with return type "ndarray" in supertype "NDFrame" + @property + def _values( # type: ignore[override] + self, + ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: + """ + Analogue to ._values that may return a 2D ExtensionArray. 
+ """ + self._consolidate_inplace() + + mgr = self._mgr + + if isinstance(mgr, ArrayManager): + if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype): + # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]" + # has no attribute "reshape" + return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr] + return self.values + + blocks = mgr.blocks + if len(blocks) != 1: + return self.values + + arr = blocks[0].values + if arr.ndim == 1: + # non-2D ExtensionArray + return self.values + + # more generally, whatever we allow in NDArrayBackedExtensionBlock + arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr) + return arr.T + + # ---------------------------------------------------------------------- + # Rendering Methods + + def _repr_fits_vertical_(self) -> bool: + """ + Check length against max_rows. + """ + max_rows = get_option("display.max_rows") + return len(self) <= max_rows + + @validate_bool_kwargs_from_keywords('ignore_width') + def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: + """ + Check if full repr fits in horizontal boundaries imposed by the display + options width and max_columns. + + In case of non-interactive session, no boundaries apply. + + `ignore_width` is here so ipynb+HTML output can behave the way + users expect. display.max_columns remains in effect. + GH3541, GH3573 + """ + width, height = console.get_console_size() + max_columns = get_option("display.max_columns") + nb_columns = len(self.columns) + + # exceed max columns + if (max_columns and nb_columns > max_columns) or ( + (not ignore_width) and width and nb_columns > (width // 2) + ): + return False + + # used by repr_html under IPython notebook or scripts ignore terminal + # dims + if ignore_width or not console.in_interactive_session(): + return True + + if get_option("display.width") is not None or console.in_ipython_frontend(): + # check at least the column row for excessive width + max_rows = 1 + else: + max_rows = get_option("display.max_rows") + + # when auto-detecting, so width=None and not in ipython front end + # check whether repr fits horizontal by actually checking + # the width of the rendered repr + buf = StringIO() + + # only care about the stuff we'll actually print out + # and to_string on entire frame may be expensive + d = self + + if max_rows is not None: # unlimited rows + # min of two, where one may be None + d = d.iloc[: min(max_rows, len(d))] + else: + return True + + d.to_string(buf=buf) + value = buf.getvalue() + repr_width = max(len(line) for line in value.split("\n")) + + return repr_width < width + + def _info_repr(self) -> bool: + """ + True if the repr should show the info view. + """ + info_repr_option = get_option("display.large_repr") == "info" + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) + + def __repr__(self) -> str: + """ + Return a string representation for a particular DataFrame. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + return buf.getvalue() + + repr_params = fmt.get_dataframe_repr_params() + return self.to_string(**repr_params) + + def _repr_html_(self) -> str | None: + """ + Return a html representation for a particular DataFrame. + + Mainly for IPython notebook. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + # need to escape the , should be the first line. + val = buf.getvalue().replace("<", r"<", 1) + val = val.replace(">", r">", 1) + return "
" + val + "
" + + if get_option("display.notebook_repr_html"): + max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") + max_cols = get_option("display.max_columns") + show_dimensions = get_option("display.show_dimensions") + + formatter = fmt.DataFrameFormatter( + self, + columns=None, + col_space=None, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + justify=None, + index_names=True, + header=True, + index=True, + bold_rows=True, + escape=True, + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=".", + ) + return fmt.DataFrameRenderer(formatter).to_html(notebook=True) + else: + return None + + @overload + def to_string( + self, + buf: None = ..., + columns: Sequence[str] | None = ..., + col_space: int | list[int] | dict[Hashable, int] | None = ..., + header: bool | Sequence[str] = ..., + index: bool = ..., + na_rep: str = ..., + formatters: fmt.FormattersType | None = ..., + float_format: fmt.FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool = ..., + decimal: str = ..., + line_width: int | None = ..., + min_rows: int | None = ..., + max_colwidth: int | None = ..., + encoding: str | None = ..., + ) -> str: + ... + + @overload + def to_string( + self, + buf: FilePath | WriteBuffer[str], + columns: Sequence[str] | None = ..., + col_space: int | list[int] | dict[Hashable, int] | None = ..., + header: bool | Sequence[str] = ..., + index: bool = ..., + na_rep: str = ..., + formatters: fmt.FormattersType | None = ..., + float_format: fmt.FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool = ..., + decimal: str = ..., + line_width: int | None = ..., + min_rows: int | None = ..., + max_colwidth: int | None = ..., + encoding: str | None = ..., + ) -> None: + ... + + @Substitution( + header_type="bool or sequence of str", + header="Write out the column names. If a list of strings " + "is given, it is assumed to be aliases for the " + "column names", + col_space_type="int, list or dict of int", + col_space="The minimum width of each column. If a list of ints is given " + "every integers corresponds with one column. If a dict is given, the key " + "references the column, while the value defines the space to use.", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + @validate_bool_kwargs_from_keywords('index', 'index_names', 'show_dimensions') + def to_string( + self, + buf: FilePath | WriteBuffer[str] | None = None, + columns: Sequence[str] | None = None, + col_space: int | list[int] | dict[Hashable, int] | None = None, + header: bool | Sequence[str] = True, + index: bool = True, + na_rep: str = "NaN", + formatters: fmt.FormattersType | None = None, + float_format: fmt.FloatFormatType | None = None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + line_width: int | None = None, + min_rows: int | None = None, + max_colwidth: int | None = None, + encoding: str | None = None, + ) -> str | None: + """ + Render a DataFrame to a console-friendly tabular output. 
+ %(shared_params)s + line_width : int, optional + Width to wrap a line in characters. + min_rows : int, optional + The number of rows to display in the console in a truncated repr + (when number of rows is above `max_rows`). + max_colwidth : int, optional + Max width to truncate each column in characters. By default, no limit. + + .. versionadded:: 1.0.0 + encoding : str, default "utf-8" + Set character encoding. + + .. versionadded:: 1.0 + %(returns)s + See Also + -------- + to_html : Convert DataFrame to HTML. + + Examples + -------- + >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} + >>> df = pd.DataFrame(d) + >>> print(df.to_string()) + col1 col2 + 0 1 4 + 1 2 5 + 2 3 6 + """ + from pandas import option_context + + with option_context("display.max_colwidth", max_colwidth): + formatter = fmt.DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + min_rows=min_rows, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + ) + return fmt.DataFrameRenderer(formatter).to_string( + buf=buf, + encoding=encoding, + line_width=line_width, + ) + + # ---------------------------------------------------------------------- + + @property + def style(self) -> Styler: + """ + Returns a Styler object. + + Contains methods for building a styled HTML representation of the DataFrame. + + See Also + -------- + io.formats.style.Styler : Helps style a DataFrame or Series according to the + data with HTML and CSS. + """ + from pandas.io.formats.style import Styler + + return Styler(self) + + _shared_docs[ + "items" + ] = r""" + Iterate over (column name, Series) pairs. + + Iterates over the DataFrame columns, returning a tuple with + the column name and the content as a Series. + + Yields + ------ + label : object + The column names for the DataFrame being iterated over. + content : Series + The column entries belonging to each label, as a Series. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as + (index, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples + of the values. + + Examples + -------- + >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'], + ... 'population': [1864, 22000, 80000]}, + ... index=['panda', 'polar', 'koala']) + >>> df + species population + panda bear 1864 + polar bear 22000 + koala marsupial 80000 + >>> for label, content in df.items(): + ... print(f'label: {label}') + ... print(f'content: {content}', sep='\n') + ... + label: species + content: + panda bear + polar bear + koala marsupial + Name: species, dtype: object + label: population + content: + panda 1864 + polar 22000 + koala 80000 + Name: population, dtype: int64 + """ + + @Appender(_shared_docs["items"]) + def items(self) -> Iterable[tuple[Hashable, Series]]: + if self.columns.is_unique and hasattr(self, "_item_cache"): + for k in self.columns: + yield k, self._get_item_cache(k) + else: + for i, k in enumerate(self.columns): + yield k, self._ixs(i, axis=1) + + @Appender(_shared_docs["items"]) + def iteritems(self) -> Iterable[tuple[Hashable, Series]]: + warnings.warn( + "iteritems is deprecated and will be removed in a future version. 
" + "Use .items instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + yield from self.items() + + def iterrows(self) -> Iterable[tuple[Hashable, Series]]: + """ + Iterate over DataFrame rows as (index, Series) pairs. + + Yields + ------ + index : label or tuple of label + The index of the row. A tuple for a `MultiIndex`. + data : Series + The data of the row as a Series. + + See Also + -------- + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + 1. Because ``iterrows`` returns a Series for each row, + it does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). For example, + + >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + >>> row = next(df.iterrows())[1] + >>> row + int 1.0 + float 1.5 + Name: 0, dtype: float64 + >>> print(row['int'].dtype) + float64 + >>> print(df['int'].dtype) + int64 + + To preserve dtypes while iterating over the rows, it is better + to use :meth:`itertuples` which returns namedtuples of the values + and which is generally faster than ``iterrows``. + + 2. You should **never modify** something you are iterating over. + This is not guaranteed to work in all cases. Depending on the + data types, the iterator returns a copy and not a view, and writing + to it will have no effect. + """ + columns = self.columns + klass = self._constructor_sliced + for k, v in zip(self.index, self.values): + s = klass(v, index=columns, name=k).__finalize__(self) + yield k, s + @validate_bool_kwargs_from_keywords('index') + def itertuples( + self, index: bool = True, name: str | None = "Pandas" + ) -> Iterable[tuple[Any, ...]]: + """ + Iterate over DataFrame rows as namedtuples. + + Parameters + ---------- + index : bool, default True + If True, return the index as the first element of the tuple. + name : str or None, default "Pandas" + The name of the returned namedtuples or None to return regular + tuples. + + Returns + ------- + iterator + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) + pairs. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + The column names will be renamed to positional names if they are + invalid Python identifiers, repeated, or start with an underscore. + + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, + ... index=['dog', 'hawk']) + >>> df + num_legs num_wings + dog 4 0 + hawk 2 2 + >>> for row in df.itertuples(): + ... print(row) + ... + Pandas(Index='dog', num_legs=4, num_wings=0) + Pandas(Index='hawk', num_legs=2, num_wings=2) + + By setting the `index` parameter to False we can remove the index + as the first element of the tuple: + + >>> for row in df.itertuples(index=False): + ... print(row) + ... + Pandas(num_legs=4, num_wings=0) + Pandas(num_legs=2, num_wings=2) + + With the `name` parameter set we set a custom name for the yielded + namedtuples: + + >>> for row in df.itertuples(name='Animal'): + ... print(row) + ... 
+ Animal(Index='dog', num_legs=4, num_wings=0) + Animal(Index='hawk', num_legs=2, num_wings=2) + """ + arrays = [] + fields = list(self.columns) + if index: + arrays.append(self.index) + fields.insert(0, "Index") + + # use integer indexing because of possible duplicate column names + arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) + + if name is not None: + # https://github.com/python/mypy/issues/9046 + # error: namedtuple() expects a string literal as the first argument + itertuple = collections.namedtuple( # type: ignore[misc] + name, fields, rename=True + ) + return map(itertuple._make, zip(*arrays)) + + # fallback to regular tuples + return zip(*arrays) + + def __len__(self) -> int: + """ + Returns length of info axis, but here we use the index. + """ + return len(self.index) + + @overload + def dot(self, other: Series) -> Series: + ... + + @overload + def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: + ... + + def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: + """ + Compute the matrix multiplication between the DataFrame and other. + + This method computes the matrix product between the DataFrame and the + values of an other Series, DataFrame or a numpy array. + + It can also be called using ``self @ other`` in Python >= 3.5. + + Parameters + ---------- + other : Series, DataFrame or array-like + The other object to compute the matrix product with. + + Returns + ------- + Series or DataFrame + If other is a Series, return the matrix product between self and + other as a Series. If other is a DataFrame or a numpy.array, return + the matrix product of self and other in a DataFrame of a np.array. + + See Also + -------- + Series.dot: Similar method for Series. + + Notes + ----- + The dimensions of DataFrame and other must be compatible in order to + compute the matrix multiplication. In addition, the column names of + DataFrame and the index of other must contain the same values, as they + will be aligned prior to the multiplication. + + The dot method for Series computes the inner product, instead of the + matrix product here. + + Examples + -------- + Here we multiply a DataFrame with a Series. + + >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) + >>> s = pd.Series([1, 1, 2, 1]) + >>> df.dot(s) + 0 -4 + 1 5 + dtype: int64 + + Here we multiply a DataFrame with another DataFrame. + + >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> df.dot(other) + 0 1 + 0 1 4 + 1 2 2 + + Note that the dot method give the same result as @ + + >>> df @ other + 0 1 + 0 1 4 + 1 2 2 + + The dot method works also if other is an np.array. + + >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> df.dot(arr) + 0 1 + 0 1 4 + 1 2 2 + + Note how shuffling of the objects does not change the result. 
+ + >>> s2 = s.reindex([1, 0, 2, 3]) + >>> df.dot(s2) + 0 -4 + 1 5 + dtype: int64 + """ + if isinstance(other, (Series, DataFrame)): + common = self.columns.union(other.index) + if len(common) > len(self.columns) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") + + left = self.reindex(columns=common, copy=False) + right = other.reindex(index=common, copy=False) + lvals = left.values + rvals = right._values + else: + left = self + lvals = self.values + rvals = np.asarray(other) + if lvals.shape[1] != rvals.shape[0]: + raise ValueError( + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" + ) + + if isinstance(other, DataFrame): + return self._constructor( + np.dot(lvals, rvals), index=left.index, columns=other.columns + ) + elif isinstance(other, Series): + return self._constructor_sliced(np.dot(lvals, rvals), index=left.index) + elif isinstance(rvals, (np.ndarray, Index)): + result = np.dot(lvals, rvals) + if result.ndim == 2: + return self._constructor(result, index=left.index) + else: + return self._constructor_sliced(result, index=left.index) + else: # pragma: no cover + raise TypeError(f"unsupported type: {type(other)}") + + @overload + def __matmul__(self, other: Series) -> Series: + ... + + @overload + def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: + ... + + def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ + return self.dot(other) + + def __rmatmul__(self, other): + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ + try: + return self.T.dot(np.transpose(other)).T + except ValueError as err: + if "shape mismatch" not in str(err): + raise + # GH#21581 give exception message for original shapes + msg = f"shapes {np.shape(other)} and {self.shape} not aligned" + raise ValueError(msg) from err + + # ---------------------------------------------------------------------- + # IO methods (to / from other formats) + + @classmethod + def from_dict( + cls, + data, + orient: str = "columns", + dtype: Dtype | None = None, + columns=None, + ) -> DataFrame: + """ + Construct DataFrame from dict of array-like or dicts. + + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. + + Parameters + ---------- + data : dict + Of the form {field : array-like} or {field : dict}. + orient : {'columns', 'index', 'tight'}, default 'columns' + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. + If 'tight', assume a dict with keys ['index', 'columns', 'data', + 'index_names', 'column_names']. + + .. versionadded:: 1.4.0 + 'tight' as an allowed value for the ``orient`` argument + + dtype : dtype, default None + Data type to force, otherwise infer. + columns : list, default None + Column labels to use when ``orient='index'``. Raises a ValueError + if used with ``orient='columns'`` or ``orient='tight'``. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.from_records : DataFrame from structured ndarray, sequence + of tuples or dicts, or DataFrame. + DataFrame : DataFrame object creation using constructor. + DataFrame.to_dict : Convert the DataFrame to a dictionary. 
+ + Examples + -------- + By default the keys of the dict become the DataFrame columns: + + >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Specify ``orient='index'`` to create the DataFrame using dictionary + keys as rows: + + >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data, orient='index') + 0 1 2 3 + row_1 3 2 1 0 + row_2 a b c d + + When using the 'index' orientation, the column names can be + specified manually: + + >>> pd.DataFrame.from_dict(data, orient='index', + ... columns=['A', 'B', 'C', 'D']) + A B C D + row_1 3 2 1 0 + row_2 a b c d + + Specify ``orient='tight'`` to create the DataFrame using a 'tight' + format: + + >>> data = {'index': [('a', 'b'), ('a', 'c')], + ... 'columns': [('x', 1), ('y', 2)], + ... 'data': [[1, 3], [2, 4]], + ... 'index_names': ['n1', 'n2'], + ... 'column_names': ['z1', 'z2']} + >>> pd.DataFrame.from_dict(data, orient='tight') + z1 x y + z2 1 2 + n1 n2 + a b 1 3 + c 2 4 + """ + index = None + orient = orient.lower() + if orient == "index": + if len(data) > 0: + # TODO speed up Series case + if isinstance(list(data.values())[0], (Series, dict)): + data = _from_nested_dict(data) + else: + data, index = list(data.values()), list(data.keys()) + elif orient == "columns" or orient == "tight": + if columns is not None: + raise ValueError(f"cannot use columns parameter with orient='{orient}'") + else: # pragma: no cover + raise ValueError("only recognize index or columns for orient") + + if orient != "tight": + return cls(data, index=index, columns=columns, dtype=dtype) + else: + realdata = data["data"] + + def create_index(indexlist, namelist): + index: Index + if len(namelist) > 1: + index = MultiIndex.from_tuples(indexlist, names=namelist) + else: + index = Index(indexlist, name=namelist[0]) + return index + + index = create_index(data["index"], data["index_names"]) + columns = create_index(data["columns"], data["column_names"]) + return cls(realdata, index=index, columns=columns, dtype=dtype) + + @validate_bool_kwargs_from_keywords('copy') + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + """ + Convert the DataFrame to a NumPy array. + + By default, the dtype of the returned array will be the common NumPy + dtype of all types in the DataFrame. For example, if the dtypes are + ``float16`` and ``float32``, the results dtype will be ``float32``. + This may require copying data and coercing values, which may be + expensive. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the dtypes of the DataFrame columns. + + .. versionadded:: 1.1.0 + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.to_numpy : Similar method for Series. + + Examples + -------- + >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() + array([[1, 3], + [2, 4]]) + + With heterogeneous data, the lowest common type will have to + be used. 
+ + >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) + >>> df.to_numpy() + array([[1. , 3. ], + [2. , 4.5]]) + + For a mix of numeric and non-numeric types, the output array will + have object dtype. + + >>> df['C'] = pd.date_range('2000', periods=2) + >>> df.to_numpy() + array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], + [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) + """ + self._consolidate_inplace() + if dtype is not None: + dtype = np.dtype(dtype) + result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) + if result.dtype is not dtype: + result = np.array(result, dtype=dtype, copy=False) + + return result + + def to_dict(self, orient: str = "dict", into=dict): + """ + Convert the DataFrame to a dictionary. + + The type of the key-value pairs can be customized with the parameters + (see below). + + Parameters + ---------- + orient : str {'dict', 'list', 'series', 'split', 'records', 'index'} + Determines the type of the values of the dictionary. + + - 'dict' (default) : dict like {column -> {index -> value}} + - 'list' : dict like {column -> [values]} + - 'series' : dict like {column -> Series(values)} + - 'split' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} + - 'tight' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values], + 'index_names' -> [index.names], 'column_names' -> [column.names]} + - 'records' : list like + [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + + Abbreviations are allowed. `s` indicates `series` and `sp` + indicates `split`. + + .. versionadded:: 1.4.0 + 'tight' as an allowed value for the ``orient`` argument + + into : class, default dict + The collections.abc.Mapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + Returns + ------- + dict, list or collections.abc.Mapping + Return a collections.abc.Mapping object representing the DataFrame. + The resulting transformation depends on the `orient` parameter. + + See Also + -------- + DataFrame.from_dict: Create a DataFrame from a dictionary. + DataFrame.to_json: Convert a DataFrame to JSON format. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], + ... 'col2': [0.5, 0.75]}, + ... index=['row1', 'row2']) + >>> df + col1 col2 + row1 1 0.50 + row2 2 0.75 + >>> df.to_dict() + {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} + + You can specify the return orientation. + + >>> df.to_dict('series') + {'col1': row1 1 + row2 2 + Name: col1, dtype: int64, + 'col2': row1 0.50 + row2 0.75 + Name: col2, dtype: float64} + + >>> df.to_dict('split') + {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], + 'data': [[1, 0.5], [2, 0.75]]} + + >>> df.to_dict('records') + [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] + + >>> df.to_dict('index') + {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} + + >>> df.to_dict('tight') + {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], + 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} + + You can also specify the mapping type. 
+ + >>> from collections import OrderedDict, defaultdict + >>> df.to_dict(into=OrderedDict) + OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), + ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) + + If you want a `defaultdict`, you need to initialize it: + + >>> dd = defaultdict(list) + >>> df.to_dict('records', into=dd) + [defaultdict(, {'col1': 1, 'col2': 0.5}), + defaultdict(, {'col1': 2, 'col2': 0.75})] + """ + if not self.columns.is_unique: + warnings.warn( + "DataFrame columns are not unique, some columns will be omitted.", + UserWarning, + stacklevel=find_stack_level(), + ) + # GH16122 + into_c = com.standardize_mapping(into) + + orient = orient.lower() + # GH32515 + if orient.startswith(("d", "l", "s", "r", "i")) and orient not in { + "dict", + "list", + "series", + "split", + "records", + "index", + }: + warnings.warn( + "Using short name for 'orient' is deprecated. Only the " + "options: ('dict', list, 'series', 'split', 'records', 'index') " + "will be used in a future version. Use one of the above " + "to silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if orient.startswith("d"): + orient = "dict" + elif orient.startswith("l"): + orient = "list" + elif orient.startswith("sp"): + orient = "split" + elif orient.startswith("s"): + orient = "series" + elif orient.startswith("r"): + orient = "records" + elif orient.startswith("i"): + orient = "index" + + if orient == "dict": + return into_c((k, v.to_dict(into)) for k, v in self.items()) + + elif orient == "list": + return into_c( + (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items() + ) + + elif orient == "split": + return into_c( + ( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), + ( + "data", + [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ], + ), + ) + ) + + elif orient == "tight": + return into_c( + ( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), + ( + "data", + [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ], + ), + ("index_names", list(self.index.names)), + ("column_names", list(self.columns.names)), + ) + ) + + elif orient == "series": + return into_c((k, v) for k, v in self.items()) + + elif orient == "records": + columns = self.columns.tolist() + rows = ( + dict(zip(columns, row)) + for row in self.itertuples(index=False, name=None) + ) + return [ + into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows + ] + + elif orient == "index": + if not self.index.is_unique: + raise ValueError("DataFrame index must be unique for orient='index'.") + return into_c( + (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:])))) + for t in self.itertuples(name=None) + ) + + else: + raise ValueError(f"orient '{orient}' not understood") + + @validate_bool_kwargs_from_keywords('reauth', 'auth_local_webserver', 'progress_bar') + def to_gbq( + self, + destination_table: str, + project_id: str | None = None, + chunksize: int | None = None, + reauth: bool = False, + if_exists: str = "fail", + auth_local_webserver: bool = True, + table_schema: list[dict[str, str]] | None = None, + location: str | None = None, + progress_bar: bool = True, + credentials=None, + ) -> None: + """ + Write a DataFrame to a Google BigQuery table. + + This function requires the `pandas-gbq package + `__. + + See the `How to authenticate with Google BigQuery + `__ + guide for authentication instructions. 
+ + Parameters + ---------- + destination_table : str + Name of table to be written, in the form ``dataset.tablename``. + project_id : str, optional + Google BigQuery Account project ID. Optional when available from + the environment. + chunksize : int, optional + Number of rows to be inserted in each chunk from the dataframe. + Set to ``None`` to load the whole dataframe at once. + reauth : bool, default False + Force Google BigQuery to re-authenticate the user. This is useful + if multiple accounts are used. + if_exists : str, default 'fail' + Behavior when the destination table exists. Value can be one of: + + ``'fail'`` + If table exists raise pandas_gbq.gbq.TableCreationError. + ``'replace'`` + If table exists, drop it, recreate it, and insert data. + ``'append'`` + If table exists, insert data. Create if does not exist. + auth_local_webserver : bool, default True + Use the `local webserver flow`_ instead of the `console flow`_ + when getting user credentials. + + .. _local webserver flow: + https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server + .. _console flow: + https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console + + *New in version 0.2.0 of pandas-gbq*. + + .. versionchanged:: 1.5.0 + Default value is changed to ``True``. Google has deprecated the + ``auth_local_webserver = False`` `"out of band" (copy-paste) + flow + `_. + table_schema : list of dicts, optional + List of BigQuery table fields to which according DataFrame + columns conform to, e.g. ``[{'name': 'col1', 'type': + 'STRING'},...]``. If schema is not provided, it will be + generated according to dtypes of DataFrame columns. See + BigQuery API documentation on available names of a field. + + *New in version 0.3.1 of pandas-gbq*. + location : str, optional + Location where the load job should run. See the `BigQuery locations + documentation + `__ for a + list of available locations. The location must match that of the + target dataset. + + *New in version 0.5.0 of pandas-gbq*. + progress_bar : bool, default True + Use the library `tqdm` to show the progress bar for the upload, + chunk by chunk. + + *New in version 0.5.0 of pandas-gbq*. + credentials : google.auth.credentials.Credentials, optional + Credentials for accessing Google APIs. Use this parameter to + override default credentials, such as to use Compute Engine + :class:`google.auth.compute_engine.Credentials` or Service + Account :class:`google.oauth2.service_account.Credentials` + directly. + + *New in version 0.8.0 of pandas-gbq*. + + See Also + -------- + pandas_gbq.to_gbq : This function in the pandas-gbq library. + read_gbq : Read a DataFrame from Google BigQuery. + """ + from pandas.io import gbq + + gbq.to_gbq( + self, + destination_table, + project_id=project_id, + chunksize=chunksize, + reauth=reauth, + if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + ) + + @classmethod + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + ) -> DataFrame: + """ + Convert structured or record ndarray to DataFrame. + + Creates a DataFrame object from a structured ndarray, sequence of + tuples or dicts, or DataFrame. 
+ + Parameters + ---------- + data : structured ndarray, sequence of tuples or dicts, or DataFrame + Structured input data. + index : str, list of fields, array-like + Field of array to use as the index, alternately a specific set of + input labels to use. + exclude : sequence, default None + Columns or fields to exclude. + columns : sequence, default None + Column names to use. If the passed data do not have names + associated with them, this argument provides names for the + columns. Otherwise this argument indicates the order of the columns + in the result (any names not found in the data will become all-NA + columns). + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + nrows : int, default None + Number of rows to read if data is an iterator. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.from_dict : DataFrame from dict of array-like or dicts. + DataFrame : DataFrame object creation using constructor. + + Examples + -------- + Data can be provided as a structured ndarray: + + >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], + ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of dicts: + + >>> data = [{'col_1': 3, 'col_2': 'a'}, + ... {'col_1': 2, 'col_2': 'b'}, + ... {'col_1': 1, 'col_2': 'c'}, + ... {'col_1': 0, 'col_2': 'd'}] + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of tuples with corresponding columns: + + >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] + >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + """ + result_index = None + + # Make a copy of the input columns so we can modify it + if columns is not None: + columns = ensure_index(columns) + + def maybe_reorder( + arrays: list[ArrayLike], arr_columns: Index, columns: Index, index + ) -> tuple[list[ArrayLike], Index, Index | None]: + """ + If our desired 'columns' do not match the data's pre-existing 'arr_columns', + we re-order our arrays. This is like a pre-emptive (cheap) reindex. 
+ """ + if len(arrays): + length = len(arrays[0]) + else: + length = 0 + + result_index = None + if len(arrays) == 0 and index is None and length == 0: + # for backward compat use an object Index instead of RangeIndex + result_index = Index([]) + + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) + return arrays, arr_columns, result_index + + if is_iterator(data): + if nrows == 0: + return cls() + + try: + first_row = next(data) + except StopIteration: + return cls(index=index, columns=columns) + + dtype = None + if hasattr(first_row, "dtype") and first_row.dtype.names: + dtype = first_row.dtype + + values = [first_row] + + if nrows is None: + values += data + else: + values.extend(itertools.islice(data, nrows - 1)) + + if dtype is not None: + data = np.array(values, dtype=dtype) + else: + data = values + + if isinstance(data, dict): + if columns is None: + columns = arr_columns = ensure_index(sorted(data)) + arrays = [data[k] for k in columns] + else: + arrays = [] + arr_columns_list = [] + for k, v in data.items(): + if k in columns: + arr_columns_list.append(k) + arrays.append(v) + + arr_columns = Index(arr_columns_list) + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index + ) + + elif isinstance(data, (np.ndarray, DataFrame)): + arrays, columns = to_arrays(data, columns) + arr_columns = columns + else: + arrays, arr_columns = to_arrays(data, columns) + if coerce_float: + for i, arr in enumerate(arrays): + if arr.dtype == object: + # error: Argument 1 to "maybe_convert_objects" has + # incompatible type "Union[ExtensionArray, ndarray]"; + # expected "ndarray" + arrays[i] = lib.maybe_convert_objects( + arr, # type: ignore[arg-type] + try_float=True, + ) + + arr_columns = ensure_index(arr_columns) + if columns is None: + columns = arr_columns + else: + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index + ) + + if exclude is None: + exclude = set() + else: + exclude = set(exclude) + + if index is not None: + if isinstance(index, str) or not hasattr(index, "__iter__"): + i = columns.get_loc(index) + exclude.add(index) + if len(arrays) > 0: + result_index = Index(arrays[i], name=index) + else: + result_index = Index([], name=index) + else: + try: + index_data = [arrays[arr_columns.get_loc(field)] for field in index] + except (KeyError, TypeError): + # raised by get_loc, see GH#29258 + result_index = index + else: + result_index = ensure_index_from_sequences(index_data, names=index) + exclude.update(index) + + if any(exclude): + arr_exclude = [x for x in exclude if x in arr_columns] + to_remove = [arr_columns.get_loc(col) for col in arr_exclude] + arrays = [v for i, v in enumerate(arrays) if i not in to_remove] + + columns = columns.drop(exclude) + + manager = get_option("mode.data_manager") + mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) + + return cls(mgr) + + def to_records( + self, index=True, column_dtypes=None, index_dtypes=None + ) -> np.recarray: + """ + Convert DataFrame to a NumPy record array. + + Index will be included as the first field of the record array if + requested. + + Parameters + ---------- + index : bool, default True + Include index in resulting record array, stored in 'index' + field or using the index label, if set. + column_dtypes : str, type, dict, default None + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. 
+ index_dtypes : str, type, dict, default None
+ If a string or type, the data type to store all index levels. If
+ a dictionary, a mapping of index level names and indices
+ (zero-indexed) to specific data types.
+
+ This mapping is applied only if `index=True`.
+
+ Returns
+ -------
+ numpy.recarray
+ NumPy ndarray with the DataFrame labels as fields and each row
+ of the DataFrame as entries.
+
+ See Also
+ --------
+ DataFrame.from_records: Convert structured or record ndarray
+ to DataFrame.
+ numpy.recarray: An ndarray that allows field access using
+ attributes, analogous to typed columns in a
+ spreadsheet.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
+ ... index=['a', 'b'])
+ >>> df
+ A B
+ a 1 0.50
+ b 2 0.75
+ >>> df.to_records()
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+ If the DataFrame index has no label then the recarray field name
+ is set to 'index'. If the index has a label then this is used as the
+ field name:
+
+ >>> df.index = df.index.rename("I")
+ >>> df.to_records()
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+ The index can be excluded from the record array:
+
+ >>> df.to_records(index=False)
+ rec.array([(1, 0.5 ), (2, 0.75)],
+ dtype=[('A', '<i8'), ('B', '<f8')])
+
+ Data types can be specified for the columns:
+
+ >>> df.to_records(column_dtypes={"A": "int32"})
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
+
+ As well as for the index:
+
+ >>> df.to_records(index_dtypes="<S2")
+ rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+ dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
+
+ >>> index_dtypes = f"<S{df.index.str.len().max()}"
+ >>> df.to_records(index_dtypes=index_dtypes)
+ rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+ dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
+ """
+ if index:
+ ix_vals = [
+ np.asarray(self.index.get_level_values(i))
+ for i in range(self.index.nlevels)
+ ]
+
+ arrays = ix_vals + [
+ np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
+ ]
+
+ index_names = list(self.index.names)
+
+ if isinstance(self.index, MultiIndex):
+ index_names = com.fill_missing_names(index_names)
+ elif index_names[0] is None:
+ index_names = ["index"]
+
+ names = [str(name) for name in itertools.chain(index_names, self.columns)]
+ else:
+ arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
+ names = [str(c) for c in self.columns]
+ index_names = []
+
+ index_len = len(index_names)
+ formats = []
+
+ for i, v in enumerate(arrays):
+ index_int = i
+
+ # When the names and arrays are collected, we
+ # first collect those in the DataFrame's index,
+ # followed by those in its columns.
+ #
+ # Thus, the total length of the array is:
+ # len(index_names) + len(DataFrame.columns).
+ #
+ # This check allows us to see whether we are
+ # handling a name / array in the index or column.
+ if index_int < index_len:
+ dtype_mapping = index_dtypes
+ name = index_names[index_int]
+ else:
+ index_int -= index_len
+ dtype_mapping = column_dtypes
+ name = self.columns[index_int]
+
+ # We have a dictionary, so we get the data type
+ # associated with the index or column (which can
+ # be denoted by its name in the DataFrame or its
+ # position in DataFrame's array of indices or
+ # columns, whichever is applicable.
+ if is_dict_like(dtype_mapping):
+ if name in dtype_mapping:
+ dtype_mapping = dtype_mapping[name]
+ elif index_int in dtype_mapping:
+ dtype_mapping = dtype_mapping[index_int]
+ else:
+ dtype_mapping = None
+
+ # If no mapping can be found, use the array's
+ # dtype attribute for formatting.
+ #
+ # A valid dtype must either be a type or
+ # string naming a type.
+ if dtype_mapping is None:
+ formats.append(v.dtype)
+ elif isinstance(dtype_mapping, (type, np.dtype, str)):
+ formats.append(dtype_mapping)
+ else:
+ element = "row" if i < index_len else "column"
+ msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
+ raise ValueError(msg)
+
+ return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
+
+ @classmethod
+ def _from_arrays(
+ cls,
+ arrays,
+ columns,
+ index,
+ dtype: Dtype | None = None,
+ verify_integrity: bool = True,
+ ) -> DataFrame:
+ """
+ Create DataFrame from a list of arrays corresponding to the columns.
+
+ Parameters
+ ----------
+ arrays : list-like of arrays
+ Each array in the list corresponds to one column, in order.
+ columns : list-like, Index
+ The column names for the resulting DataFrame.
+ index : list-like, Index
+ The row labels for the resulting DataFrame.
+ dtype : dtype, optional
+ Optional dtype to enforce for all arrays.
+ verify_integrity : bool, default True
+ Validate and homogenize all input. If set to False, it is assumed
+ that all elements of `arrays` are actual arrays as they will be
+ stored in a block (numpy ndarray or ExtensionArray), have the same
+ length as and are aligned with the index, and that `columns` and
+ `index` are ensured to be an Index object.
+
+ Returns
+ -------
+ DataFrame
+ """
+ if dtype is not None:
+ dtype = pandas_dtype(dtype)
+
+ manager = get_option("mode.data_manager")
+ columns = ensure_index(columns)
+ if len(columns) != len(arrays):
+ raise ValueError("len(columns) must match len(arrays)")
+ mgr = arrays_to_mgr(
+ arrays,
+ columns,
+ index,
+ dtype=dtype,
+ verify_integrity=verify_integrity,
+ typ=manager,
+ )
+ return cls(mgr)
+
+ @doc(
+ storage_options=_shared_docs["storage_options"],
+ compression_options=_shared_docs["compression_options"] % "path",
+ )
+ @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
+ @validate_bool_kwargs_from_keywords('write_index')
+ def to_stata(
+ self,
+ path: FilePath | WriteBuffer[bytes],
+ convert_dates: dict[Hashable, str] | None = None,
+ write_index: bool = True,
+ byteorder: str | None = None,
+ time_stamp: datetime.datetime | None = None,
+ data_label: str | None = None,
+ variable_labels: dict[Hashable, str] | None = None,
+ version: int | None = 114,
+ convert_strl: Sequence[Hashable] | None = None,
+ compression: CompressionOptions = "infer",
+ storage_options: StorageOptions = None,
+ *,
+ value_labels: dict[Hashable, dict[float | int, str]] | None = None,
+ ) -> None:
+ """
+ Export DataFrame object to Stata dta format.
+
+ Writes the DataFrame to a Stata dataset file.
+ "dta" files contain a Stata dataset.
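+
+ As an illustrative sketch (``date_col`` is a placeholder column name),
+ a datetime column can be mapped to a Stata date format via
+ ``convert_dates``:
+
+ >>> df = pd.DataFrame({{"date_col": pd.date_range("2020-01-01", periods=2)}})
+ >>> df.to_stata("dates.dta", convert_dates={{"date_col": "td"}}) # doctest: +SKIP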
+ + Parameters + ---------- + path : str, path object, or buffer + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. + + .. versionchanged:: 1.0.0 + + Previously this was "fname" + + convert_dates : dict + Dictionary mapping columns containing datetime types to stata + internal format to use when writing the dates. Options are 'tc', + 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer + or a name. Datetime columns that do not have a conversion type + specified will be converted to 'tc'. Raises NotImplementedError if + a datetime column has timezone information. + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder`. + time_stamp : datetime + A datetime to use as file creation date. Default is the current + time. + data_label : str, optional + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as + values. Each label must be 80 characters or smaller. + version : {{114, 117, 118, 119, None}}, default 114 + Version to use in the output dta file. Set to None to let pandas + decide between 118 or 119 formats depending on the number of + columns in the frame. Version 114 can be read by Stata 10 and + later. Version 117 can be read by Stata 13 or later. Version 118 + is supported in Stata 14 and later. Version 119 is supported in + Stata 15 and later. Version 114 limits string variables to 244 + characters or fewer while versions 117 and later allow strings + with lengths up to 2,000,000 characters. Versions 118 and 119 + support Unicode characters, and version 119 supports more than + 32,767 variables. + + Version 119 should usually only be used when the number of + variables exceeds the capacity of dta format 118. Exporting + smaller datasets in format 119 may have unintended consequences, + and, as of November 2020, Stata SE cannot read version 119 files. + + .. versionchanged:: 1.0.0 + + Added support for formats 118 and 119. + + convert_strl : list, optional + List of column names to convert to string columns to Stata StrL + format. Only available if version is 117. Storing strings in the + StrL format can produce smaller dta files if strings have more than + 8 characters and values are repeated. + {compression_options} + + .. versionadded:: 1.1.0 + + .. versionchanged:: 1.4.0 Zstandard support. + + {storage_options} + + .. versionadded:: 1.2.0 + + value_labels : dict of dicts + Dictionary containing columns as keys and dictionaries of column value + to labels as values. Labels for a single variable must be 32,000 + characters or smaller. + + .. versionadded:: 1.4.0 + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + * Column dtype is not representable in Stata + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime.datetime + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + See Also + -------- + read_stata : Import Stata data files. + io.stata.StataWriter : Low-level writer for Stata data files. + io.stata.StataWriter117 : Low-level writer for version 117 files. + + Examples + -------- + >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', + ... 'parrot'], + ... 
'speed': [350, 18, 361, 15]}}) + >>> df.to_stata('animals.dta') # doctest: +SKIP + """ + if version not in (114, 117, 118, 119, None): + raise ValueError("Only formats 114, 117, 118 and 119 are supported.") + if version == 114: + if convert_strl is not None: + raise ValueError("strl is not supported in format 114") + from pandas.io.stata import StataWriter as statawriter + elif version == 117: + # mypy: Name 'statawriter' already defined (possibly by an import) + from pandas.io.stata import ( # type: ignore[no-redef] + StataWriter117 as statawriter, + ) + else: # versions 118 and 119 + # mypy: Name 'statawriter' already defined (possibly by an import) + from pandas.io.stata import ( # type: ignore[no-redef] + StataWriterUTF8 as statawriter, + ) + + kwargs: dict[str, Any] = {} + if version is None or version >= 117: + # strl conversion is only supported >= 117 + kwargs["convert_strl"] = convert_strl + if version is None or version >= 118: + # Specifying the version is only supported for UTF8 (118 or 119) + kwargs["version"] = version + + writer = statawriter( + path, + self, + convert_dates=convert_dates, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + write_index=write_index, + variable_labels=variable_labels, + compression=compression, + storage_options=storage_options, + value_labels=value_labels, + **kwargs, + ) + writer.write_file() + + @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") + def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: + """ + Write a DataFrame to the binary Feather format. + + Parameters + ---------- + path : str, path object, file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If a string or a path, + it will be used as Root Directory path when writing a partitioned dataset. + **kwargs : + Additional keywords passed to :func:`pyarrow.feather.write_feather`. + Starting with pyarrow 0.17, this includes the `compression`, + `compression_level`, `chunksize` and `version` keywords. + + .. versionadded:: 1.1.0 + + Notes + ----- + This function writes the dataframe as a `feather file + `_. Requires a default + index. For saving the DataFrame with your custom index use a method that + supports custom indices e.g. `to_parquet`. + """ + from pandas.io.feather_format import to_feather + + to_feather(self, path, **kwargs) + + @doc( + Series.to_markdown, + klass=_shared_doc_kwargs["klass"], + storage_options=_shared_docs["storage_options"], + examples="""Examples + -------- + >>> df = pd.DataFrame( + ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} + ... ) + >>> print(df.to_markdown()) + | | animal_1 | animal_2 | + |---:|:-----------|:-----------| + | 0 | elk | dog | + | 1 | pig | quetzal | + + Output markdown with a tabulate option. + + >>> print(df.to_markdown(tablefmt="grid")) + +----+------------+------------+ + | | animal_1 | animal_2 | + +====+============+============+ + | 0 | elk | dog | + +----+------------+------------+ + | 1 | pig | quetzal | + +----+------------+------------+""", + ) + def to_markdown( + self, + buf: FilePath | WriteBuffer[str] | None = None, + mode: str = "wt", + index: bool = True, + storage_options: StorageOptions = None, + **kwargs, + ) -> str | None: + if "showindex" in kwargs: + warnings.warn( + "'showindex' is deprecated. Only 'index' will be used " + "in a future version. 
Use 'index' to silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + kwargs.setdefault("headers", "keys") + kwargs.setdefault("tablefmt", "pipe") + kwargs.setdefault("showindex", index) + tabulate = import_optional_dependency("tabulate") + result = tabulate.tabulate(self, **kwargs) + if buf is None: + return result + + with get_handle(buf, mode, storage_options=storage_options) as handles: + handles.handle.write(result) + return None + + @doc(storage_options=_shared_docs["storage_options"]) + @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") + def to_parquet( + self, + path: FilePath | WriteBuffer[bytes] | None = None, + engine: str = "auto", + compression: str | None = "snappy", + index: bool | None = None, + partition_cols: list[str] | None = None, + storage_options: StorageOptions = None, + **kwargs, + ) -> bytes | None: + """ + Write a DataFrame to the binary parquet format. + + This function writes the dataframe as a `parquet file + `_. You can choose different parquet + backends, and have the option of compression. See + :ref:`the user guide ` for more details. + + Parameters + ---------- + path : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If None, the result is + returned as bytes. If a string or path, it will be used as Root Directory + path when writing a partitioned dataset. + + .. versionchanged:: 1.2.0 + + Previously this was "fname" + + engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. + compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. + If ``None``, similar to ``True`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + partition_cols : list, optional, default None + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + Must be None if path is not a string. + {storage_options} + + .. versionadded:: 1.2.0 + + **kwargs + Additional arguments passed to the parquet library. See + :ref:`pandas io ` for more details. + + Returns + ------- + bytes if no path argument is provided else None + + See Also + -------- + read_parquet : Read a parquet file. + DataFrame.to_csv : Write a csv file. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_hdf : Write to hdf. + + Notes + ----- + This function requires either the `fastparquet + `_ or `pyarrow + `_ library. + + Examples + -------- + >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) + >>> df.to_parquet('df.parquet.gzip', + ... compression='gzip') # doctest: +SKIP + >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP + col1 col2 + 0 1 3 + 1 2 4 + + If you want to get a buffer to the parquet content you can use a io.BytesIO + object, as long as you don't use partition_cols, which creates multiple files. 
+ + >>> import io + >>> f = io.BytesIO() + >>> df.to_parquet(f) + >>> f.seek(0) + 0 + >>> content = f.read() + """ + from pandas.io.parquet import to_parquet + + return to_parquet( + self, + path, + engine, + compression=compression, + index=index, + partition_cols=partition_cols, + storage_options=storage_options, + **kwargs, + ) + + @Substitution( + header_type="bool", + header="Whether to print column labels, default True", + col_space_type="str or int, list or dict of int or str", + col_space="The minimum width of each column in CSS length " + "units. An int is assumed to be px units.\n\n" + " .. versionadded:: 0.25.0\n" + " Ability to use str", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + @validate_bool_kwargs_from_keywords('index', 'index_names', 'bold_rows', 'escape', 'notebook', 'render_links') + def to_html( + self, + buf: FilePath | WriteBuffer[str] | None = None, + columns: Sequence[str] | None = None, + col_space: ColspaceArgType | None = None, + header: bool | Sequence[str] = True, + index: bool = True, + na_rep: str = "NaN", + formatters: FormattersType | None = None, + float_format: FloatFormatType | None = None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool | str = False, + decimal: str = ".", + bold_rows: bool = True, + classes: str | list | tuple | None = None, + escape: bool = True, + notebook: bool = False, + border: int | bool | None = None, + table_id: str | None = None, + render_links: bool = False, + encoding: str | None = None, + ): + """ + Render a DataFrame as an HTML table. + %(shared_params)s + bold_rows : bool, default True + Make the row labels bold in the output. + classes : str or list or tuple, default None + CSS class(es) to apply to the resulting html table. + escape : bool, default True + Convert the characters <, >, and & to HTML-safe sequences. + notebook : {True, False}, default False + Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + `` tag. Default ``pd.options.display.html.border``. + table_id : str, optional + A css id is included in the opening `
` tag if specified. + render_links : bool, default False + Convert URLs to HTML links. + encoding : str, default "utf-8" + Set character encoding. + + .. versionadded:: 1.0 + %(returns)s + See Also + -------- + to_string : Convert DataFrame to a string. + """ + if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: + raise ValueError("Invalid value for justify parameter") + + formatter = fmt.DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + header=header, + index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + justify=justify, + index_names=index_names, + escape=escape, + decimal=decimal, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + ) + # TODO: a generic formatter wld b in DataFrameFormatter + return fmt.DataFrameRenderer(formatter).to_html( + buf=buf, + classes=classes, + notebook=notebook, + border=border, + encoding=encoding, + table_id=table_id, + render_links=render_links, + ) + + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path_or_buffer", + ) + @validate_bool_kwargs_from_keywords('index') + def to_xml( + self, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + index: bool = True, + root_name: str | None = "data", + row_name: str | None = "row", + na_rep: str | None = None, + attr_cols: list[str] | None = None, + elem_cols: list[str] | None = None, + namespaces: dict[str | None, str] | None = None, + prefix: str | None = None, + encoding: str = "utf-8", + xml_declaration: bool | None = True, + pretty_print: bool | None = True, + parser: str | None = "lxml", + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, + ) -> str | None: + """ + Render a DataFrame to an XML document. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a ``write()`` function. If None, the result is returned + as a string. + index : bool, default True + Whether to include index in XML document. + root_name : str, default 'data' + The name of root element in XML document. + row_name : str, default 'row' + The name of row element in XML document. + na_rep : str, optional + Missing data representation. + attr_cols : list-like, optional + List of columns to write as attributes in row element. + Hierarchical columns will be flattened with underscore + delimiting the different levels. + elem_cols : list-like, optional + List of columns to write as children in row element. By default, + all columns output as children of row element. Hierarchical + columns will be flattened with underscore delimiting the + different levels. + namespaces : dict, optional + All namespaces to be defined in root element. Keys of dict + should be prefix names and values of dict corresponding URIs. + Default namespaces should be given empty string key. For + example, :: + + namespaces = {{"": "https://example.com"}} + + prefix : str, optional + Namespace prefix to be used for every element and/or attribute + in document. This should be one of the keys in ``namespaces`` + dict. + encoding : str, default 'utf-8' + Encoding of the resulting document. 
+
+ xml_declaration : bool, default True
+ Whether to include the XML declaration at start of document.
+ pretty_print : bool, default True
+ Whether output should be pretty printed with indentation and
+ line breaks.
+ parser : {{'lxml','etree'}}, default 'lxml'
+ Parser module to use for building of tree. Only 'lxml' and
+ 'etree' are supported. With 'lxml', the ability to use XSLT
+ stylesheet is supported.
+ stylesheet : str, path object or file-like object, optional
+ A URL, file-like object, or a raw string containing an XSLT
+ script used to transform the raw XML output. Script should use
+ layout of elements and attributes from original output. This
+ argument requires ``lxml`` to be installed. Only XSLT 1.0
+ scripts, and not later versions, are currently supported.
+ {compression_options}
+
+ .. versionchanged:: 1.4.0 Zstandard support.
+
+ {storage_options}
+
+ Returns
+ -------
+ None or str
+ If ``io`` is None, returns the resulting XML format as a
+ string. Otherwise returns None.
+
+ See Also
+ --------
+ to_json : Convert the pandas object to a JSON string.
+ to_html : Convert DataFrame to HTML.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
+ ... 'degrees': [360, 360, 180],
+ ... 'sides': [4, np.nan, 3]}})
+
+ >>> df.to_xml() # doctest: +SKIP
+ <?xml version='1.0' encoding='utf-8'?>
+ <data>
+ <row>
+ <index>0</index>
+ <shape>square</shape>
+ <degrees>360</degrees>
+ <sides>4.0</sides>
+ </row>
+ <row>
+ <index>1</index>
+ <shape>circle</shape>
+ <degrees>360</degrees>
+ <sides/>
+ </row>
+ <row>
+ <index>2</index>
+ <shape>triangle</shape>
+ <degrees>180</degrees>
+ <sides>3.0</sides>
+ </row>
+ </data>
+
+ >>> df.to_xml(attr_cols=[
+ ... 'index', 'shape', 'degrees', 'sides'
+ ... ]) # doctest: +SKIP
+ <?xml version='1.0' encoding='utf-8'?>
+ <data>
+ <row index="0" shape="square" degrees="360" sides="4.0"/>
+ <row index="1" shape="circle" degrees="360"/>
+ <row index="2" shape="triangle" degrees="180" sides="3.0"/>
+ </data>
+
+ >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
+ ... prefix="doc") # doctest: +SKIP
+ <?xml version='1.0' encoding='utf-8'?>
+ <doc:data xmlns:doc="https://example.com">
+ <doc:row>
+ <doc:index>0</doc:index>
+ <doc:shape>square</doc:shape>
+ <doc:degrees>360</doc:degrees>
+ <doc:sides>4.0</doc:sides>
+ </doc:row>
+ <doc:row>
+ <doc:index>1</doc:index>
+ <doc:shape>circle</doc:shape>
+ <doc:degrees>360</doc:degrees>
+ <doc:sides/>
+ </doc:row>
+ <doc:row>
+ <doc:index>2</doc:index>
+ <doc:shape>triangle</doc:shape>
+ <doc:degrees>180</doc:degrees>
+ <doc:sides>3.0</doc:sides>
+ </doc:row>
+ </doc:data>
+ """
+
+ from pandas.io.formats.xml import (
+ EtreeXMLFormatter,
+ LxmlXMLFormatter,
+ )
+
+ lxml = import_optional_dependency("lxml.etree", errors="ignore")
+
+ TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter]
+
+ if parser == "lxml":
+ if lxml is not None:
+ TreeBuilder = LxmlXMLFormatter
+ else:
+ raise ImportError(
+ "lxml not found, please install or use the etree parser."
+ )
+
+ elif parser == "etree":
+ TreeBuilder = EtreeXMLFormatter
+
+ else:
+ raise ValueError("Values for parser can only be lxml or etree.")
+
+ xml_formatter = TreeBuilder(
+ self,
+ path_or_buffer=path_or_buffer,
+ index=index,
+ root_name=root_name,
+ row_name=row_name,
+ na_rep=na_rep,
+ attr_cols=attr_cols,
+ elem_cols=elem_cols,
+ namespaces=namespaces,
+ prefix=prefix,
+ encoding=encoding,
+ xml_declaration=xml_declaration,
+ pretty_print=pretty_print,
+ stylesheet=stylesheet,
+ compression=compression,
+ storage_options=storage_options,
+ )
+
+ return xml_formatter.write_output()
+
+ # ----------------------------------------------------------------------
+ @doc(INFO_DOCSTRING, **frame_sub_kwargs)
+ def info(
+ self,
+ verbose: bool | None = None,
+ buf: WriteBuffer[str] | None = None,
+ max_cols: int | None = None,
+ memory_usage: bool | str | None = None,
+ show_counts: bool | None = None,
+ null_counts: bool | None = None,
+ ) -> None:
+ if null_counts is not None:
+ if show_counts is not None:
+ raise ValueError("null_counts used with show_counts. Use show_counts.")
+ warnings.warn(
+ "null_counts is deprecated. Use show_counts instead",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
+ show_counts = null_counts
+ info = DataFrameInfo(
+ data=self,
+ memory_usage=memory_usage,
+ )
+ info.render(
+ buf=buf,
+ max_cols=max_cols,
+ verbose=verbose,
+ show_counts=show_counts,
+ )
+
+ @validate_bool_kwargs_from_keywords('index', 'deep')
+ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
+ """
+ Return the memory usage of each column in bytes.
+
+ The memory usage can optionally include the contribution of
+ the index and elements of `object` dtype.
+
+ This value is displayed in `DataFrame.info` by default. This can be
+ suppressed by setting ``pandas.options.display.memory_usage`` to False.
+
+ Parameters
+ ----------
+ index : bool, default True
+ Specifies whether to include the memory usage of the DataFrame's
+ index in returned Series. If ``index=True``, the memory usage of
+ the index is the first item in the output.
+ deep : bool, default False
+ If True, introspect the data deeply by interrogating
+ `object` dtypes for system-level memory consumption, and include
+ it in the returned values.
+
+ Returns
+ -------
+ Series
+ A Series whose index is the original column names and whose values
+ are the memory usage of each column in bytes.
+
+ See Also
+ --------
+ numpy.ndarray.nbytes : Total bytes consumed by the elements of an
+ ndarray.
+ Series.memory_usage : Bytes consumed by a Series.
+ Categorical : Memory-efficient array for string values with
+ many repeated values.
+ DataFrame.info : Concise summary of a DataFrame.
+
+ Notes
+ -----
+ See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
+ details.
+
+ Examples
+ --------
+ >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
+ >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
+ ... for t in dtypes])
+ >>> df = pd.DataFrame(data)
+ >>> df.head()
+ int64 float64 complex128 object bool
+ 0 1 1.0 1.0+0.0j 1 True
+ 1 1 1.0 1.0+0.0j 1 True
+ 2 1 1.0 1.0+0.0j 1 True
+ 3 1 1.0 1.0+0.0j 1 True
+ 4 1 1.0 1.0+0.0j 1 True
+
+ >>> df.memory_usage()
+ Index 128
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 40000
+ bool 5000
+ dtype: int64
+
+ >>> df.memory_usage(index=False)
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 40000
+ bool 5000
+ dtype: int64
+
+ The memory footprint of `object` dtype columns is ignored by default:
+
+ >>> df.memory_usage(deep=True)
+ Index 128
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 180000
+ bool 5000
+ dtype: int64
+
+ Use a Categorical for efficient storage of an object-dtype column with
+ many repeated values.
+
+ >>> df['object'].astype('category').memory_usage(deep=True)
+ 5244
+ """
+ result = self._constructor_sliced(
+ [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
+ index=self.columns,
+ )
+ if index:
+ index_memory_usage = self._constructor_sliced(
+ self.index.memory_usage(deep=deep), index=["Index"]
+ )
+ result = index_memory_usage._append(result)
+ return result
+
+ @validate_bool_kwargs_from_keywords('copy')
+ def transpose(self, *args, copy: bool = False) -> DataFrame:
+ """
+ Transpose index and columns.
+
+ Reflect the DataFrame over its main diagonal by writing rows as columns
+ and vice-versa. The property :attr:`.T` is an accessor to the method
+ :meth:`transpose`.
+
+ Parameters
+ ----------
+ *args : tuple, optional
+ Accepted for compatibility with NumPy.
+ copy : bool, default False
+ Whether to copy the data after transposing, even for DataFrames
+ with a single dtype.
+ + Note that a copy is always required for mixed dtype DataFrames, + or for DataFrames with any extension types. + + Returns + ------- + DataFrame + The transposed DataFrame. + + See Also + -------- + numpy.transpose : Permute the dimensions of a given array. + + Notes + ----- + Transposing a DataFrame with mixed dtypes will result in a homogeneous + DataFrame with the `object` dtype. In such a case, a copy of the data + is always made. + + Examples + -------- + **Square DataFrame with homogeneous dtype** + + >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} + >>> df1 = pd.DataFrame(data=d1) + >>> df1 + col1 col2 + 0 1 3 + 1 2 4 + + >>> df1_transposed = df1.T # or df1.transpose() + >>> df1_transposed + 0 1 + col1 1 2 + col2 3 4 + + When the dtype is homogeneous in the original DataFrame, we get a + transposed DataFrame with the same dtype: + + >>> df1.dtypes + col1 int64 + col2 int64 + dtype: object + >>> df1_transposed.dtypes + 0 int64 + 1 int64 + dtype: object + + **Non-square DataFrame with mixed dtypes** + + >>> d2 = {'name': ['Alice', 'Bob'], + ... 'score': [9.5, 8], + ... 'employed': [False, True], + ... 'kids': [0, 0]} + >>> df2 = pd.DataFrame(data=d2) + >>> df2 + name score employed kids + 0 Alice 9.5 False 0 + 1 Bob 8.0 True 0 + + >>> df2_transposed = df2.T # or df2.transpose() + >>> df2_transposed + 0 1 + name Alice Bob + score 9.5 8.0 + employed False True + kids 0 0 + + When the DataFrame has mixed dtypes, we get a transposed DataFrame with + the `object` dtype: + + >>> df2.dtypes + name object + score float64 + employed bool + kids int64 + dtype: object + >>> df2_transposed.dtypes + 0 object + 1 object + dtype: object + """ + nv.validate_transpose(args, {}) + # construct the args + + dtypes = list(self.dtypes) + + if self._can_fast_transpose: + # Note: tests pass without this, but this improves perf quite a bit. + new_vals = self._values.T + if copy: + new_vals = new_vals.copy() + + result = self._constructor(new_vals, index=self.columns, columns=self.index) + + elif ( + self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]) + ): + # We have EAs with the same dtype. We can preserve that dtype in transpose. + dtype = dtypes[0] + arr_type = dtype.construct_array_type() + values = self.values + + new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] + result = type(self)._from_arrays( + new_values, index=self.columns, columns=self.index + ) + + else: + new_arr = self.values.T + if copy: + new_arr = new_arr.copy() + result = self._constructor(new_arr, index=self.columns, columns=self.index) + + return result.__finalize__(self, method="transpose") + + @property + def T(self) -> DataFrame: + return self.transpose() + + # ---------------------------------------------------------------------- + # Indexing Methods + + def _ixs(self, i: int, axis: int = 0): + """ + Parameters + ---------- + i : int + axis : int + + Notes + ----- + If slice passed, the resulting data will be a view. 
+ """ + # irow + if axis == 0: + new_values = self._mgr.fast_xs(i) + + # if we are a copy, mark as such + copy = isinstance(new_values, np.ndarray) and new_values.base is None + result = self._constructor_sliced( + new_values, + index=self.columns, + name=self.index[i], + dtype=new_values.dtype, + ).__finalize__(self) + result._set_is_copy(self, copy=copy) + return result + + # icol + else: + label = self.columns[i] + + col_mgr = self._mgr.iget(i) + result = self._box_col_values(col_mgr, i) + + # this is a cached value, mark it so + result._set_as_cached(label, self) + return result + + def _get_column_array(self, i: int) -> ArrayLike: + """ + Get the values of the i'th column (ndarray or ExtensionArray, as stored + in the Block) + """ + return self._mgr.iget_values(i) + + def _iter_column_arrays(self) -> Iterator[ArrayLike]: + """ + Iterate over the arrays of all columns in order. + This returns the values as stored in the Block (ndarray or ExtensionArray). + """ + for i in range(len(self.columns)): + yield self._get_column_array(i) + + def __getitem__(self, key): + check_deprecated_indexers(key) + key = lib.item_from_zerodim(key) + key = com.apply_if_callable(key, self) + + if is_hashable(key) and not is_iterator(key): + # is_iterator to exclude generator e.g. test_getitem_listlike + # shortcut if the key is in columns + if self.columns.is_unique and key in self.columns: + if isinstance(self.columns, MultiIndex): + return self._getitem_multilevel(key) + return self._get_item_cache(key) + + # Do we have a slicer (on rows)? + indexer = convert_to_index_sliceable(self, key) + if indexer is not None: + if isinstance(indexer, np.ndarray): + indexer = lib.maybe_indices_to_slice( + indexer.astype(np.intp, copy=False), len(self) + ) + if isinstance(indexer, np.ndarray): + # GH#43223 If we can not convert, use take + return self.take(indexer, axis=0) + # either we have a slice or we have a string that can be converted + # to a slice for partial-string date indexing + return self._slice(indexer, axis=0) + + # Do we have a (boolean) DataFrame? + if isinstance(key, DataFrame): + return self.where(key) + + # Do we have a (boolean) 1d indexer? + if com.is_bool_indexer(key): + return self._getitem_bool_array(key) + + # We are left with two options: a single key, and a collection of keys, + # We interpret tuples as collections only for non-MultiIndex + is_single_key = isinstance(key, tuple) or not is_list_like(key) + + if is_single_key: + if self.columns.nlevels > 1: + return self._getitem_multilevel(key) + indexer = self.columns.get_loc(key) + if is_integer(indexer): + indexer = [indexer] + else: + if is_iterator(key): + key = list(key) + indexer = self.columns._get_indexer_strict(key, "columns")[1] + + # take() does not accept boolean indexers + if getattr(indexer, "dtype", None) == bool: + indexer = np.where(indexer)[0] + + data = self._take_with_is_copy(indexer, axis=1) + + if is_single_key: + # What does looking for a single key in a non-unique index return? + # The behavior is inconsistent. 
It returns a Series, except when + # - the key itself is repeated (test on data.shape, #9519), or + # - we have a MultiIndex on columns (test on self.columns, #21309) + if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): + # GH#26490 using data[key] can cause RecursionError + return data._get_item_cache(key) + + return data + + def _getitem_bool_array(self, key): + # also raises Exception if object array with NA values + # warning here just in case -- previously __setitem__ was + # reindexing but __getitem__ was not; it seems more reasonable to + # go with the __setitem__ behavior since that is more consistent + # with all other indexing behavior + if isinstance(key, Series) and not key.index.equals(self.index): + warnings.warn( + "Boolean Series key will be reindexed to match DataFrame index.", + UserWarning, + stacklevel=find_stack_level(), + ) + elif len(key) != len(self.index): + raise ValueError( + f"Item wrong length {len(key)} instead of {len(self.index)}." + ) + + # check_bool_indexer will throw exception if Series key cannot + # be reindexed to match DataFrame rows + key = check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + return self._take_with_is_copy(indexer, axis=0) + + def _getitem_multilevel(self, key): + # self.columns is a MultiIndex + loc = self.columns.get_loc(key) + if isinstance(loc, (slice, np.ndarray)): + new_columns = self.columns[loc] + result_columns = maybe_droplevels(new_columns, key) + if self._is_mixed_type: + result = self.reindex(columns=new_columns) + result.columns = result_columns + else: + new_values = self.values[:, loc] + result = self._constructor( + new_values, index=self.index, columns=result_columns + ) + result = result.__finalize__(self) + + # If there is only one column being returned, and its name is + # either an empty string, or a tuple with an empty string as its + # first element, then treat the empty string as a placeholder + # and return the column as if the user had provided that empty + # string in the key. If the result is a Series, exclude the + # implied empty string from its name. + if len(result.columns) == 1: + top = result.columns[0] + if isinstance(top, tuple): + top = top[0] + if top == "": + result = result[""] + if isinstance(result, Series): + result = self._constructor_sliced( + result, index=self.index, name=key + ) + + result._set_is_copy(self) + return result + else: + # loc is neither a slice nor ndarray, so must be an int + return self._ixs(loc, axis=1) + + @validate_bool_kwargs_from_keywords('takeable') + def _get_value(self, index, col, takeable: bool = False) -> Scalar: + """ + Quickly retrieve single value at passed column and index. + + Parameters + ---------- + index : row label + col : column label + takeable : interpret the index/col as indexers, default False + + Returns + ------- + scalar + + Notes + ----- + Assumes that both `self.index._index_as_unique` and + `self.columns._index_as_unique`; Caller is responsible for checking. 
+ """ + if takeable: + series = self._ixs(col, axis=1) + return series._values[index] + + series = self._get_item_cache(col) + engine = self.index._engine + + if not isinstance(self.index, MultiIndex): + # CategoricalIndex: Trying to use the engine fastpath may give incorrect + # results if our categories are integers that dont match our codes + # IntervalIndex: IntervalTree has no get_loc + row = self.index.get_loc(index) + return series._values[row] + + # For MultiIndex going through engine effectively restricts us to + # same-length tuples; see test_get_set_value_no_partial_indexing + loc = engine.get_loc(index) + return series._values[loc] + + def __setitem__(self, key, value): + key = com.apply_if_callable(key, self) + + # see if we can slice the rows + indexer = convert_to_index_sliceable(self, key) + if indexer is not None: + # either we have a slice or we have a string that can be converted + # to a slice for partial-string date indexing + return self._setitem_slice(indexer, value) + + if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: + self._setitem_frame(key, value) + elif isinstance(key, (Series, np.ndarray, list, Index)): + self._setitem_array(key, value) + elif isinstance(value, DataFrame): + self._set_item_frame_value(key, value) + elif ( + is_list_like(value) + and not self.columns.is_unique + and 1 < len(self.columns.get_indexer_for([key])) == len(value) + ): + # Column to set is duplicated + self._setitem_array([key], value) + else: + # set column + self._set_item(key, value) + + def _setitem_slice(self, key: slice, value): + # NB: we can't just use self.loc[key] = value because that + # operates on labels and we need to operate positional for + # backwards-compat, xref GH#31469 + self._check_setitem_copy() + self.iloc[key] = value + + def _setitem_array(self, key, value): + # also raises Exception if object array with NA values + if com.is_bool_indexer(key): + # bool indexer is indexing along rows + if len(key) != len(self.index): + raise ValueError( + f"Item wrong length {len(key)} instead of {len(self.index)}!" + ) + key = check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + self._check_setitem_copy() + if isinstance(value, DataFrame): + # GH#39931 reindex since iloc does not align + value = value.reindex(self.index.take(indexer)) + self.iloc[indexer] = value + + else: + # Note: unlike self.iloc[:, indexer] = value, this will + # never try to overwrite values inplace + + if isinstance(value, DataFrame): + check_key_length(self.columns, key, value) + for k1, k2 in zip(key, value.columns): + self[k1] = value[k2] + + elif not is_list_like(value): + for col in key: + self[col] = value + + elif isinstance(value, np.ndarray) and value.ndim == 2: + self._iset_not_inplace(key, value) + + elif np.ndim(value) > 1: + # list of lists + value = DataFrame(value).values + return self._setitem_array(key, value) + + else: + self._iset_not_inplace(key, value) + + def _iset_not_inplace(self, key, value): + # GH#39510 when setting with df[key] = obj with a list-like key and + # list-like value, we iterate over those listlikes and set columns + # one at a time. This is different from dispatching to + # `self.loc[:, key]= value` because loc.__setitem__ may overwrite + # data inplace, whereas this will insert new arrays. 
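+ # For illustration (hypothetical): df[["A", "B"]] = arr2d with a 2D
+ # ndarray assigns arr2d[..., 0] to column "A" and arr2d[..., 1] to
+ # column "B" as new arrays rather than writing into existing blocks.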
+ + def igetitem(obj, i: int): + # Note: we catch DataFrame obj before getting here, but + # hypothetically would return obj.iloc[:, i] + if isinstance(obj, np.ndarray): + return obj[..., i] + else: + return obj[i] + + if self.columns.is_unique: + if np.shape(value)[-1] != len(key): + raise ValueError("Columns must be same length as key") + + for i, col in enumerate(key): + self[col] = igetitem(value, i) + + else: + + ilocs = self.columns.get_indexer_non_unique(key)[0] + if (ilocs < 0).any(): + # key entries not in self.columns + raise NotImplementedError + + if np.shape(value)[-1] != len(ilocs): + raise ValueError("Columns must be same length as key") + + assert np.ndim(value) <= 2 + + orig_columns = self.columns + + # Using self.iloc[:, i] = ... may set values inplace, which + # by convention we do not do in __setitem__ + try: + self.columns = Index(range(len(self.columns))) + for i, iloc in enumerate(ilocs): + self[iloc] = igetitem(value, i) + finally: + self.columns = orig_columns + + def _setitem_frame(self, key, value): + # support boolean setting with DataFrame input, e.g. + # df[df > df2] = 0 + if isinstance(key, np.ndarray): + if key.shape != self.shape: + raise ValueError("Array conditional must be same shape as self") + key = self._constructor(key, **self._construct_axes_dict()) + + if key.size and not is_bool_dtype(key.values): + raise TypeError( + "Must pass DataFrame or 2-d ndarray with boolean values only" + ) + + self._check_inplace_setting(value) + self._check_setitem_copy() + self._where(-key, value, inplace=True) + + def _set_item_frame_value(self, key, value: DataFrame) -> None: + self._ensure_valid_index(value) + + # align columns + if key in self.columns: + loc = self.columns.get_loc(key) + cols = self.columns[loc] + len_cols = 1 if is_scalar(cols) else len(cols) + if len_cols != len(value.columns): + raise ValueError("Columns must be same length as key") + + # align right-hand-side columns if self.columns + # is multi-index and self[key] is a sub-frame + if isinstance(self.columns, MultiIndex) and isinstance( + loc, (slice, Series, np.ndarray, Index) + ): + cols = maybe_droplevels(cols, key) + if len(cols) and not cols.equals(value.columns): + value = value.reindex(cols, axis=1) + + # now align rows + arraylike = _reindex_for_setitem(value, self.index) + self._set_item_mgr(key, arraylike) + + def _iset_item_mgr( + self, loc: int | slice | np.ndarray, value, inplace: bool = False + ) -> None: + # when called from _set_item_mgr loc can be anything returned from get_loc + self._mgr.iset(loc, value, inplace=inplace) + self._clear_item_cache() + + def _set_item_mgr(self, key, value: ArrayLike) -> None: + try: + loc = self._info_axis.get_loc(key) + except KeyError: + # This item wasn't present, just insert at end + self._mgr.insert(len(self._info_axis), key, value) + else: + self._iset_item_mgr(loc, value) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + + def _iset_item(self, loc: int, value) -> None: + arraylike = self._sanitize_column(value) + self._iset_item_mgr(loc, arraylike, inplace=True) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + + def _set_item(self, key, value) -> None: + """ + Add series to DataFrame in specified column. 
+ + If series is a numpy-array (not a Series/TimeSeries), it must be the + same length as the DataFrames index or an error will be thrown. + + Series/TimeSeries will be conformed to the DataFrames index to + ensure homogeneity. + """ + value = self._sanitize_column(value) + + if ( + key in self.columns + and value.ndim == 1 + and not is_extension_array_dtype(value) + ): + # broadcast across multiple columns if necessary + if not self.columns.is_unique or isinstance(self.columns, MultiIndex): + existing_piece = self[key] + if isinstance(existing_piece, DataFrame): + value = np.tile(value, (len(existing_piece.columns), 1)).T + + self._set_item_mgr(key, value) + + def _set_value( + self, index: IndexLabel, col, value: Scalar, takeable: bool = False + ) -> None: + """ + Put single value at passed column and index. + + Parameters + ---------- + index : Label + row label + col : Label + column label + value : scalar + takeable : bool, default False + Sets whether or not index/col interpreted as indexers + """ + try: + if takeable: + series = self._ixs(col, axis=1) + loc = index + else: + series = self._get_item_cache(col) + loc = self.index.get_loc(index) + + # setitem_inplace will do validation that may raise TypeError, + # ValueError, or LossySetitemError + series._mgr.setitem_inplace(loc, value) + + except (KeyError, TypeError, ValueError, LossySetitemError): + # set using a non-recursive method & reset the cache + if takeable: + self.iloc[index, col] = value + else: + self.loc[index, col] = value + self._item_cache.pop(col, None) + + def _ensure_valid_index(self, value) -> None: + """ + Ensure that if we don't have an index, that we can create one from the + passed value. + """ + # GH5632, make sure that we are a Series convertible + if not len(self.index) and is_list_like(value) and len(value): + if not isinstance(value, DataFrame): + try: + value = Series(value) + except (ValueError, NotImplementedError, TypeError) as err: + raise ValueError( + "Cannot set a frame with no defined index " + "and a value that cannot be converted to a Series" + ) from err + + # GH31368 preserve name of index + index_copy = value.index.copy() + if self.index.name is not None: + index_copy.name = self.index.name + + self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) + + def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: + """ + Provide boxed values for a column. + """ + # Lookup in columns so that if e.g. a str datetime was passed + # we attach the Timestamp object as the name. 
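+ # For example (illustrative): with a DatetimeIndex for columns,
+ # df["2016-01-01"] returns a Series whose name is the matching
+ # Timestamp label rather than the raw string.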
+ name = self.columns[loc] + klass = self._constructor_sliced + # We get index=self.index bc values is a SingleDataManager + return klass(values, name=name, fastpath=True).__finalize__(self) + + # ---------------------------------------------------------------------- + # Lookup Caching + + def _clear_item_cache(self) -> None: + self._item_cache.clear() + + def _get_item_cache(self, item: Hashable) -> Series: + """Return the cached item, item represents a label indexer.""" + cache = self._item_cache + res = cache.get(item) + if res is None: + # All places that call _get_item_cache have unique columns, + # pending resolution of GH#33047 + + loc = self.columns.get_loc(item) + res = self._ixs(loc, axis=1) + + cache[item] = res + + # for a chain + res._is_copy = self._is_copy + return res + + def _reset_cacher(self) -> None: + # no-op for DataFrame + pass + + @validate_bool_kwargs_from_keywords('inplace') + def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: + """ + The object has called back to us saying maybe it has changed. + """ + loc = self._info_axis.get_loc(item) + arraylike = value._values + + old = self._ixs(loc, axis=1) + if old._values is value._values and inplace: + # GH#46149 avoid making unnecessary copies/block-splitting + return + + self._mgr.iset(loc, arraylike, inplace=inplace) + + # ---------------------------------------------------------------------- + # Unsorted + + @validate_bool_kwargs_from_keywords('inplace') + def query(self, expr: str, inplace: bool = False, **kwargs): + """ + Query the columns of a DataFrame with a boolean expression. + + Parameters + ---------- + expr : str + The query string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + .. versionadded:: 0.25.0 + Backtick quoting introduced. + + .. versionadded:: 1.0.0 + Expanding functionality of backtick quoting for more than only spaces. + + inplace : bool + Whether the query should modify the data in place or return + a modified copy. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. + + Returns + ------- + DataFrame or None + DataFrame resulting from the provided query expression or + None if ``inplace=True``. + + See Also + -------- + eval : Evaluate a string describing operations on + DataFrame columns. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. + + This method uses the top-level :func:`eval` function to + evaluate the passed query. + + The :meth:`~pandas.DataFrame.query` method uses a slightly + modified Python syntax by default. 
For example, the ``&`` and ``|`` + (bitwise) operators have the precedence of their boolean cousins, + :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, + however the semantics are different. + + You can change the semantics of the expression by passing the keyword + argument ``parser='python'``. This enforces the same semantics as + evaluation in Python space. Likewise, you can pass ``engine='python'`` + to evaluate an expression using Python itself as a backend. This is not + recommended as it is inefficient compared to using ``numexpr`` as the + engine. + + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the + frame as a column in the frame. + The identifier ``index`` is used for the frame index; you can also + use the name of the index to identify it in a query. Please note that + Python keywords may not be used as identifiers. + + For further details and examples see the ``query`` documentation in + :ref:`indexing `. + + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a Python valid identifier. + This can lead to the following problems. + + During parsing a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed as a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. + + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 'C C': range(10, 5, -1)}) + >>> df + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + >>> df.query('A > B') + A B C C + 4 5 2 6 + + The previous expression is equivalent to + + >>> df[df.A > df.B] + A B C C + 4 5 2 6 + + For columns with spaces in their name, you can use backtick quoting. 
+ + >>> df.query('B == `C C`') + A B C C + 0 1 10 10 + + The previous expression is equivalent to + + >>> df[df.B == df['C C']] + A B C C + 0 1 10 10 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if not isinstance(expr, str): + msg = f"expr must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + kwargs["level"] = kwargs.pop("level", 0) + 1 + kwargs["target"] = None + res = self.eval(expr, **kwargs) + + try: + result = self.loc[res] + except ValueError: + # when res is multi-dimensional loc raises, but this is sometimes a + # valid query + result = self[res] + + if inplace: + self._update_inplace(result) + return None + else: + return result + + @validate_bool_kwargs_from_keywords('inplace') + def eval(self, expr: str, inplace: bool = False, **kwargs): + """ + Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ndarray, scalar, pandas object, or None + The result of the evaluation or None if ``inplace=True``. + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.assign : Can evaluate an expression or function to create new + values for a column. + eval : Evaluate a Python expression as a string using various + backends. + + Notes + ----- + For more details see the API documentation for :func:`~eval`. + For detailed examples see :ref:`enhancing performance with eval + `. + + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Use ``inplace=True`` to modify the original DataFrame. + + >>> df.eval('C = A + B', inplace=True) + >>> df + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + from pandas.core.computation.eval import eval as _eval + + inplace = validate_bool_kwarg(inplace, "inplace") + kwargs["level"] = kwargs.pop("level", 0) + 1 + index_resolvers = self._get_index_resolvers() + column_resolvers = self._get_cleaned_column_resolvers() + resolvers = column_resolvers, index_resolvers + if "target" not in kwargs: + kwargs["target"] = self + kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers + + return _eval(expr, inplace=inplace, **kwargs) + + def select_dtypes(self, include=None, exclude=None) -> DataFrame: + """ + Return a subset of the DataFrame's columns based on the column dtypes. 
+ + Parameters + ---------- + include, exclude : scalar or list-like + A selection of dtypes or strings to be included/excluded. At least + one of these parameters must be supplied. + + Returns + ------- + DataFrame + The subset of the frame including the dtypes in ``include`` and + excluding the dtypes in ``exclude``. + + Raises + ------ + ValueError + * If both of ``include`` and ``exclude`` are empty + * If ``include`` and ``exclude`` have overlapping elements + * If any kind of string dtype is passed in. + + See Also + -------- + DataFrame.dtypes: Return Series with the data type of each column. + + Notes + ----- + * To select all *numeric* types, use ``np.number`` or ``'number'`` + * To select strings you must use the ``object`` dtype, but note that + this will return *all* object dtype columns + * See the `numpy dtype hierarchy + `__ + * To select datetimes, use ``np.datetime64``, ``'datetime'`` or + ``'datetime64'`` + * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or + ``'timedelta64'`` + * To select Pandas categorical dtypes, use ``'category'`` + * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in + 0.20.0) or ``'datetime64[ns, tz]'`` + + Examples + -------- + >>> df = pd.DataFrame({'a': [1, 2] * 3, + ... 'b': [True, False] * 3, + ... 'c': [1.0, 2.0] * 3}) + >>> df + a b c + 0 1 True 1.0 + 1 2 False 2.0 + 2 1 True 1.0 + 3 2 False 2.0 + 4 1 True 1.0 + 5 2 False 2.0 + + >>> df.select_dtypes(include='bool') + b + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + + >>> df.select_dtypes(include=['float64']) + c + 0 1.0 + 1 2.0 + 2 1.0 + 3 2.0 + 4 1.0 + 5 2.0 + + >>> df.select_dtypes(exclude=['int64']) + b c + 0 True 1.0 + 1 False 2.0 + 2 True 1.0 + 3 False 2.0 + 4 True 1.0 + 5 False 2.0 + """ + if not is_list_like(include): + include = (include,) if include is not None else () + if not is_list_like(exclude): + exclude = (exclude,) if exclude is not None else () + + selection = (frozenset(include), frozenset(exclude)) + + if not any(selection): + raise ValueError("at least one of include or exclude must be nonempty") + + # convert the myriad valid dtypes object to a single representation + def check_int_infer_dtype(dtypes): + converted_dtypes: list[type] = [] + for dtype in dtypes: + # Numpy maps int to different types (int32, in64) on Windows and Linux + # see https://github.com/numpy/numpy/issues/9464 + if (isinstance(dtype, str) and dtype == "int") or (dtype is int): + converted_dtypes.append(np.int32) + converted_dtypes.append(np.int64) + elif dtype == "float" or dtype is float: + # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20 + converted_dtypes.extend([np.float64, np.float32]) + else: + converted_dtypes.append(infer_dtype_from_object(dtype)) + return frozenset(converted_dtypes) + + include = check_int_infer_dtype(include) + exclude = check_int_infer_dtype(exclude) + + for dtypes in (include, exclude): + invalidate_string_dtypes(dtypes) + + # can't both include AND exclude! 
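+        # e.g. ``df.select_dtypes(include=['int64'], exclude=['int64'])``
+        # reaches this point with both frozensets holding ``np.int64`` and
+        # raises below (illustrative example of the overlap check).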
+ if not include.isdisjoint(exclude): + raise ValueError(f"include and exclude overlap on {(include & exclude)}") + + def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: + return issubclass(dtype.type, tuple(dtypes_set)) or ( + np.number in dtypes_set and getattr(dtype, "_is_numeric", False) + ) + + def predicate(arr: ArrayLike) -> bool: + dtype = arr.dtype + if include: + if not dtype_predicate(dtype, include): + return False + + if exclude: + if dtype_predicate(dtype, exclude): + return False + + return True + + mgr = self._mgr._get_data_subset(predicate) + return type(self)(mgr).__finalize__(self) + + def insert( + self, + loc: int, + column: Hashable, + value: Scalar | AnyArrayLike, + allow_duplicates: bool | lib.NoDefault = lib.no_default, + ) -> None: + """ + Insert column into DataFrame at specified location. + + Raises a ValueError if `column` is already contained in the DataFrame, + unless `allow_duplicates` is set to True. + + Parameters + ---------- + loc : int + Insertion index. Must verify 0 <= loc <= len(columns). + column : str, number, or hashable object + Label of the inserted column. + value : Scalar, Series, or array-like + allow_duplicates : bool, optional, default lib.no_default + + See Also + -------- + Index.insert : Insert new item by index. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + >>> df.insert(1, "newcol", [99, 99]) + >>> df + col1 newcol col2 + 0 1 99 3 + 1 2 99 4 + >>> df.insert(0, "col1", [100, 100], allow_duplicates=True) + >>> df + col1 col1 newcol col2 + 0 100 1 99 3 + 1 100 2 99 4 + + Notice that pandas uses index alignment in case of `value` from type `Series`: + + >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2])) + >>> df + col0 col1 col1 newcol col2 + 0 NaN 100 1 99 3 + 1 5.0 100 2 99 4 + """ + if allow_duplicates is lib.no_default: + allow_duplicates = False + if allow_duplicates and not self.flags.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'allow_duplicates=True' when " + "'self.flags.allows_duplicate_labels' is False." + ) + if not allow_duplicates and column in self.columns: + # Should this be a different kind of error?? + raise ValueError(f"cannot insert {column}, already exists") + if not isinstance(loc, int): + raise TypeError("loc must be int") + + value = self._sanitize_column(value) + self._mgr.insert(loc, column, value) + + def assign(self, **kwargs) -> DataFrame: + r""" + Assign new columns to a DataFrame. + + Returns a new object with all original columns in addition to new ones. + Existing columns that are re-assigned will be overwritten. + + Parameters + ---------- + **kwargs : dict of {str: callable or Series} + The column names are keywords. If the values are + callable, they are computed on the DataFrame and + assigned to the new columns. The callable must not + change input DataFrame (though pandas doesn't check it). + If the values are not callable, (e.g. a Series, scalar, or array), + they are simply assigned. + + Returns + ------- + DataFrame + A new DataFrame with the new columns in addition to + all the existing columns. + + Notes + ----- + Assigning multiple columns within the same ``assign`` is possible. + Later items in '\*\*kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in order. + + Examples + -------- + >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, + ... 
index=['Portland', 'Berkeley']) + >>> df + temp_c + Portland 17.0 + Berkeley 25.0 + + Where the value is a callable, evaluated on `df`: + + >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + Alternatively, the same behavior can be achieved by directly + referencing an existing Series or sequence: + + >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + You can create multiple columns within the same assign where one + of the columns depends on another one defined within the same assign: + + >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, + ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) + temp_c temp_f temp_k + Portland 17.0 62.6 290.15 + Berkeley 25.0 77.0 298.15 + """ + data = self.copy() + + for k, v in kwargs.items(): + data[k] = com.apply_if_callable(v, data) + return data + + def _sanitize_column(self, value) -> ArrayLike: + """ + Ensures new columns (which go into the BlockManager as new blocks) are + always copied and converted into an array. + + Parameters + ---------- + value : scalar, Series, or array-like + + Returns + ------- + numpy.ndarray or ExtensionArray + """ + self._ensure_valid_index(value) + + # We should never get here with DataFrame value + if isinstance(value, Series): + return _reindex_for_setitem(value, self.index) + + if is_list_like(value): + com.require_length_match(value, self.index) + return sanitize_array(value, self.index, copy=True, allow_2d=True) + + @property + def _series(self): + return { + item: Series( + self._mgr.iget(idx), index=self.index, name=item, fastpath=True + ) + for idx, item in enumerate(self.columns) + } + + def lookup( + self, row_labels: Sequence[IndexLabel], col_labels: Sequence[IndexLabel] + ) -> np.ndarray: + """ + Label-based "fancy indexing" function for DataFrame. + Given equal-length arrays of row and column labels, return an + array of the values corresponding to each (row, col) pair. + + .. deprecated:: 1.2.0 + DataFrame.lookup is deprecated, + use pandas.factorize and NumPy indexing instead. + For further details see + :ref:`Looking up values by index/column labels `. + + Parameters + ---------- + row_labels : sequence + The row labels to use for lookup. + col_labels : sequence + The column labels to use for lookup. + + Returns + ------- + numpy.ndarray + The found values. + """ + msg = ( + "The 'lookup' method is deprecated and will be " + "removed in a future version. " + "You can use DataFrame.melt and DataFrame.loc " + "as a substitute." 
+ ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + + n = len(row_labels) + if n != len(col_labels): + raise ValueError("Row labels must have same size as column labels") + if not (self.index.is_unique and self.columns.is_unique): + # GH#33041 + raise ValueError("DataFrame.lookup requires unique index and columns") + + thresh = 1000 + if not self._is_mixed_type or n > thresh: + values = self.values + ridx = self.index.get_indexer(row_labels) + cidx = self.columns.get_indexer(col_labels) + if (ridx == -1).any(): + raise KeyError("One or more row labels was not found") + if (cidx == -1).any(): + raise KeyError("One or more column labels was not found") + flat_index = ridx * len(self.columns) + cidx + result = values.flat[flat_index] + else: + result = np.empty(n, dtype="O") + for i, (r, c) in enumerate(zip(row_labels, col_labels)): + result[i] = self._get_value(r, c) + + if is_object_dtype(result): + result = lib.maybe_convert_objects(result) + + return result + + # ---------------------------------------------------------------------- + # Reindexing and alignment + + def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): + frame = self + + columns = axes["columns"] + if columns is not None: + frame = frame._reindex_columns( + columns, method, copy, level, fill_value, limit, tolerance + ) + + index = axes["index"] + if index is not None: + frame = frame._reindex_index( + index, method, copy, level, fill_value, limit, tolerance + ) + + return frame + + def _reindex_index( + self, + new_index, + method, + copy: bool, + level: Level, + fill_value=np.nan, + limit=None, + tolerance=None, + ): + new_index, indexer = self.index.reindex( + new_index, method=method, level=level, limit=limit, tolerance=tolerance + ) + return self._reindex_with_indexers( + {0: [new_index, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False, + ) + + def _reindex_columns( + self, + new_columns, + method, + copy: bool, + level: Level, + fill_value=None, + limit=None, + tolerance=None, + ): + new_columns, indexer = self.columns.reindex( + new_columns, method=method, level=level, limit=limit, tolerance=tolerance + ) + return self._reindex_with_indexers( + {1: [new_columns, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False, + ) + + def _reindex_multi( + self, axes: dict[str, Index], copy: bool, fill_value + ) -> DataFrame: + """ + We are guaranteed non-Nones in the axes. + """ + + new_index, row_indexer = self.index.reindex(axes["index"]) + new_columns, col_indexer = self.columns.reindex(axes["columns"]) + + if row_indexer is not None and col_indexer is not None: + # Fastpath. By doing two 'take's at once we avoid making an + # unnecessary copy. + # We only get here with `not self._is_mixed_type`, which (almost) + # ensures that self.values is cheap. It may be worth making this + # condition more specific. 
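+            # Illustratively: reindexing a homogeneous float frame on both
+            # axes at once, e.g.
+            #     df.reindex(index=new_idx, columns=new_cols)
+            # is served by the single take_2d_multi call below rather than
+            # two sequential one-axis takes. (Hypothetical example call, not
+            # additional behavior.)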
+ indexer = row_indexer, col_indexer + new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) + return self._constructor(new_values, index=new_index, columns=new_columns) + else: + return self._reindex_with_indexers( + {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, + copy=copy, + fill_value=fill_value, + ) + + @doc(NDFrame.align, **_shared_doc_kwargs) + @validate_bool_kwargs_from_keywords('copy') + def align( + self, + other, + join: str = "outer", + axis: Axis | None = None, + level: Level | None = None, + copy: bool = True, + fill_value=None, + method: str | None = None, + limit=None, + fill_axis: Axis = 0, + broadcast_axis: Axis | None = None, + ) -> DataFrame: + return super().align( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) + + @overload + def set_axis( + self, labels, axis: Axis = ..., inplace: Literal[False] = ... + ) -> DataFrame: + ... + + @overload + def set_axis(self, labels, axis: Axis, inplace: Literal[True]) -> None: + ... + + @overload + def set_axis(self, labels, *, inplace: Literal[True]) -> None: + ... + + @overload + def set_axis( + self, labels, axis: Axis = ..., inplace: bool = ... + ) -> DataFrame | None: + ... + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) + @Appender( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + Change the row labels. + + >>> df.set_axis(['a', 'b', 'c'], axis='index') + A B + a 1 4 + b 2 5 + c 3 6 + + Change the column labels. + + >>> df.set_axis(['I', 'II'], axis='columns') + I II + 0 1 4 + 1 2 5 + 2 3 6 + + Now, update the labels inplace. + + >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) + >>> df + i ii + 0 1 4 + 1 2 5 + 2 3 6 + """ + ) + @Substitution( + **_shared_doc_kwargs, + extended_summary_sub=" column or", + axis_description_sub=", and 1 identifies the columns", + see_also_sub=" or columns", + ) + @Appender(NDFrame.set_axis.__doc__) + @validate_bool_kwargs_from_keywords('inplace') + def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): + return super().set_axis(labels, axis=axis, inplace=inplace) + + @Substitution(**_shared_doc_kwargs) + @Appender(NDFrame.reindex.__doc__) + @rewrite_axis_style_signature( + "labels", + [ + ("method", None), + ("copy", True), + ("level", None), + ("fill_value", np.nan), + ("limit", None), + ("tolerance", None), + ], + ) + def reindex(self, *args, **kwargs) -> DataFrame: + axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") + kwargs.update(axes) + # Pop these, since the values are in `kwargs` under different names + kwargs.pop("axis", None) + kwargs.pop("labels", None) + return super().reindex(**kwargs) + + @overload + def drop( + self, + labels: Hashable | list[Hashable] = ..., + *, + axis: Axis = ..., + index: Hashable | list[Hashable] = ..., + columns: Hashable | list[Hashable] = ..., + level: Level | None = ..., + inplace: Literal[True], + errors: IgnoreRaise = ..., + ) -> None: + ... + + @overload + def drop( + self, + labels: Hashable | list[Hashable] = ..., + *, + axis: Axis = ..., + index: Hashable | list[Hashable] = ..., + columns: Hashable | list[Hashable] = ..., + level: Level | None = ..., + inplace: Literal[False] = ..., + errors: IgnoreRaise = ..., + ) -> DataFrame: + ... 
+
+    @overload
+    def drop(
+        self,
+        labels: Hashable | list[Hashable] = ...,
+        *,
+        axis: Axis = ...,
+        index: Hashable | list[Hashable] = ...,
+        columns: Hashable | list[Hashable] = ...,
+        level: Level | None = ...,
+        inplace: bool = ...,
+        errors: IgnoreRaise = ...,
+    ) -> DataFrame | None:
+        ...
+
+    # error: Signature of "drop" incompatible with supertype "NDFrame"
+    # github.com/python/mypy/issues/12387
+    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
+    @validate_bool_kwargs_from_keywords('inplace')
+    def drop(  # type: ignore[override]
+        self,
+        labels: Hashable | list[Hashable] = None,
+        axis: Axis = 0,
+        index: Hashable | list[Hashable] = None,
+        columns: Hashable | list[Hashable] = None,
+        level: Level | None = None,
+        inplace: bool = False,
+        errors: IgnoreRaise = "raise",
+    ) -> DataFrame | None:
+        """
+        Drop specified labels from rows or columns.
+
+        Remove rows or columns by specifying label names and corresponding
+        axis, or by directly specifying index or column names. When using a
+        multi-index, labels on different levels can be removed by specifying
+        the level. See the :ref:`user guide <advanced.shown_levels>` for more
+        information about the now unused levels.
+
+        Parameters
+        ----------
+        labels : single label or list-like
+            Index or column labels to drop. A tuple will be used as a single
+            label and not treated as a list-like.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Whether to drop labels from the index (0 or 'index') or
+            columns (1 or 'columns').
+        index : single label or list-like
+            Alternative to specifying axis (``labels, axis=0``
+            is equivalent to ``index=labels``).
+        columns : single label or list-like
+            Alternative to specifying axis (``labels, axis=1``
+            is equivalent to ``columns=labels``).
+        level : int or level name, optional
+            For MultiIndex, level from which the labels will be removed.
+        inplace : bool, default False
+            If False, return a copy. Otherwise, do the operation
+            in place and return None.
+        errors : {'ignore', 'raise'}, default 'raise'
+            If 'ignore', suppress error and only existing labels are
+            dropped.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame without the removed index or column labels or
+            None if ``inplace=True``.
+
+        Raises
+        ------
+        KeyError
+            If any of the labels is not found in the selected axis.
+
+        See Also
+        --------
+        DataFrame.loc : Label-location based indexer for selection by label.
+        DataFrame.dropna : Return DataFrame with labels on given axis omitted
+            where (all or any) data are missing.
+        DataFrame.drop_duplicates : Return DataFrame with duplicate rows
+            removed, optionally only considering certain columns.
+        Series.drop : Return Series with specified index labels removed.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
+        ...                   columns=['A', 'B', 'C', 'D'])
+        >>> df
+           A  B   C   D
+        0  0  1   2   3
+        1  4  5   6   7
+        2  8  9  10  11
+
+        Drop columns
+
+        >>> df.drop(['B', 'C'], axis=1)
+           A   D
+        0  0   3
+        1  4   7
+        2  8  11
+
+        >>> df.drop(columns=['B', 'C'])
+           A   D
+        0  0   3
+        1  4   7
+        2  8  11
+
+        Drop a row by index
+
+        >>> df.drop([0, 1])
+           A  B   C   D
+        2  8  9  10  11
+
+        Drop columns and/or rows of MultiIndex DataFrame
+
+        >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
+        ...                              ['speed', 'weight', 'length']],
+        ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
+        ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
+        >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
+        ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
+        ...                         [250, 150], [1.5, 0.8], [320, 250],
+        ...                         [1, 0.8], [0.3, 0.2]])
+        >>> df
+                        big     small
+        lama    speed   45.0    30.0
+                weight  200.0   100.0
+                length  1.5     1.0
+        cow     speed   30.0    20.0
+                weight  250.0   150.0
+                length  1.5     0.8
+        falcon  speed   320.0   250.0
+                weight  1.0     0.8
+                length  0.3     0.2
+
+        Drop a specific index combination from the MultiIndex
+        DataFrame, i.e., drop the combination ``'falcon'`` and
+        ``'weight'``, which deletes only the corresponding row
+
+        >>> df.drop(index=('falcon', 'weight'))
+                        big     small
+        lama    speed   45.0    30.0
+                weight  200.0   100.0
+                length  1.5     1.0
+        cow     speed   30.0    20.0
+                weight  250.0   150.0
+                length  1.5     0.8
+        falcon  speed   320.0   250.0
+                length  0.3     0.2
+
+        >>> df.drop(index='cow', columns='small')
+                        big
+        lama    speed   45.0
+                weight  200.0
+                length  1.5
+        falcon  speed   320.0
+                weight  1.0
+                length  0.3
+
+        >>> df.drop(index='length', level=1)
+                        big     small
+        lama    speed   45.0    30.0
+                weight  200.0   100.0
+        cow     speed   30.0    20.0
+                weight  250.0   150.0
+        falcon  speed   320.0   250.0
+                weight  1.0     0.8
+        """
+        return super().drop(
+            labels=labels,
+            axis=axis,
+            index=index,
+            columns=columns,
+            level=level,
+            inplace=inplace,
+            errors=errors,
+        )
+
+    @overload
+    def rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        copy: bool = ...,
+        inplace: Literal[True],
+        level: Level | None = ...,
+        errors: IgnoreRaise = ...,
+    ) -> None:
+        ...
+
+    @overload
+    def rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        copy: bool = ...,
+        inplace: Literal[False] = ...,
+        level: Level | None = ...,
+        errors: IgnoreRaise = ...,
+    ) -> DataFrame:
+        ...
+
+    @overload
+    def rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        copy: bool = ...,
+        inplace: bool = ...,
+        level: Level | None = ...,
+        errors: IgnoreRaise = ...,
+    ) -> DataFrame | None:
+        ...
+
+    @validate_bool_kwargs_from_keywords('inplace', 'copy')
+    def rename(
+        self,
+        mapper: Renamer | None = None,
+        *,
+        index: Renamer | None = None,
+        columns: Renamer | None = None,
+        axis: Axis | None = None,
+        copy: bool = True,
+        inplace: bool = False,
+        level: Level | None = None,
+        errors: IgnoreRaise = "ignore",
+    ) -> DataFrame | None:
+        """
+        Alter axes labels.
+
+        Function / dict values must be unique (1-to-1). Labels not contained in
+        a dict / Series will be left as-is. Extra labels listed don't throw an
+        error.
+
+        See the :ref:`user guide <basics.rename>` for more.
+
+        Parameters
+        ----------
+        mapper : dict-like or function
+            Dict-like or function transformations to apply to
+            that axis' values. Use either ``mapper`` and ``axis`` to
+            specify the axis to target with ``mapper``, or ``index`` and
+            ``columns``.
+        index : dict-like or function
+            Alternative to specifying axis (``mapper, axis=0``
+            is equivalent to ``index=mapper``).
+        columns : dict-like or function
+            Alternative to specifying axis (``mapper, axis=1``
+            is equivalent to ``columns=mapper``).
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Axis to target with ``mapper``. Can be either the axis name
+            ('index', 'columns') or number (0, 1). The default is 'index'.
+        copy : bool, default True
+            Also copy underlying data.
+        inplace : bool, default False
+            Whether to return a new DataFrame. If True then value of copy is
+            ignored.
+        level : int or level name, default None
+            In case of a MultiIndex, only rename labels in the specified
+            level.
+ errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, + or `columns` contains labels that are not present in the Index + being transformed. + If 'ignore', existing keys will be renamed and extra keys will be + ignored. + + Returns + ------- + DataFrame or None + DataFrame with the renamed axis labels or None if ``inplace=True``. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + "errors='raise'". + + See Also + -------- + DataFrame.rename_axis : Set the name of the axis. + + Examples + -------- + ``DataFrame.rename`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. + + Rename columns using a mapping: + + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + >>> df.rename(columns={"A": "a", "B": "c"}) + a c + 0 1 4 + 1 2 5 + 2 3 6 + + Rename index using a mapping: + + >>> df.rename(index={0: "x", 1: "y", 2: "z"}) + A B + x 1 4 + y 2 5 + z 3 6 + + Cast index labels to a different type: + + >>> df.index + RangeIndex(start=0, stop=3, step=1) + >>> df.rename(index=str).index + Index(['0', '1', '2'], dtype='object') + + >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") + Traceback (most recent call last): + KeyError: ['C'] not found in axis + + Using axis-style parameters: + + >>> df.rename(str.lower, axis='columns') + a b + 0 1 4 + 1 2 5 + 2 3 6 + + >>> df.rename({1: 2, 2: 4}, axis='index') + A B + 0 1 4 + 2 2 5 + 4 3 6 + """ + return super()._rename( + mapper=mapper, + index=index, + columns=columns, + axis=axis, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) + + @overload + def fillna( + self, + value=..., + method: FillnaOptions | None = ..., + axis: Axis | None = ..., + inplace: Literal[False] = ..., + limit=..., + downcast=..., + ) -> DataFrame: + ... + + @overload + def fillna( + self, + value, + method: FillnaOptions | None, + axis: Axis | None, + inplace: Literal[True], + limit=..., + downcast=..., + ) -> None: + ... + + @overload + def fillna( + self, + *, + inplace: Literal[True], + limit=..., + downcast=..., + ) -> None: + ... + + @overload + def fillna( + self, + value, + *, + inplace: Literal[True], + limit=..., + downcast=..., + ) -> None: + ... + + @overload + def fillna( + self, + *, + method: FillnaOptions | None, + inplace: Literal[True], + limit=..., + downcast=..., + ) -> None: + ... + + @overload + def fillna( + self, + *, + axis: Axis | None, + inplace: Literal[True], + limit=..., + downcast=..., + ) -> None: + ... + + @overload + def fillna( + self, + *, + method: FillnaOptions | None, + axis: Axis | None, + inplace: Literal[True], + limit=..., + downcast=..., + ) -> None: + ... + + @overload + def fillna( + self, + value, + *, + axis: Axis | None, + inplace: Literal[True], + limit=..., + downcast=..., + ) -> None: + ... + + @overload + def fillna( + self, + value, + method: FillnaOptions | None, + *, + inplace: Literal[True], + limit=..., + downcast=..., + ) -> None: + ... + + @overload + def fillna( + self, + value=..., + method: FillnaOptions | None = ..., + axis: Axis | None = ..., + inplace: bool = ..., + limit=..., + downcast=..., + ) -> DataFrame | None: + ... 
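+    # ``validate_bool_kwargs_from_keywords`` (imported from
+    # pandas.util._validators) backs the boolean-kwarg validation applied
+    # throughout this module. A minimal sketch of such a decorator, assuming
+    # it only funnels each named argument through ``validate_bool_kwarg``;
+    # the real helper lives in pandas/util/_validators.py and may differ:
+    #
+    #     import functools
+    #     import inspect
+    #
+    #     def validate_bool_kwargs_from_keywords(*names):
+    #         def decorator(func):
+    #             sig = inspect.signature(func)
+    #
+    #             @functools.wraps(func)
+    #             def wrapper(*args, **kwargs):
+    #                 bound = sig.bind(*args, **kwargs)
+    #                 bound.apply_defaults()
+    #                 for name in names:
+    #                     # raises ValueError for non-bool (and non-None)
+    #                     validate_bool_kwarg(bound.arguments[name], name)
+    #                 return func(*args, **kwargs)
+    #
+    #             return wrapper
+    #
+    #         return decorator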
+ + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"]) + @doc(NDFrame.fillna, **_shared_doc_kwargs) + @validate_bool_kwargs_from_keywords('inplace') + def fillna( + self, + value: object | ArrayLike | None = None, + method: FillnaOptions | None = None, + axis: Axis | None = None, + inplace: bool = False, + limit=None, + downcast=None, + ) -> DataFrame | None: + return super().fillna( + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + + def pop(self, item: Hashable) -> Series: + """ + Return item and drop from frame. Raise KeyError if not found. + + Parameters + ---------- + item : label + Label of column to be popped. + + Returns + ------- + Series + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... columns=('name', 'class', 'max_speed')) + >>> df + name class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + >>> df.pop('class') + 0 bird + 1 bird + 2 mammal + 3 mammal + Name: class, dtype: object + + >>> df + name max_speed + 0 falcon 389.0 + 1 parrot 24.0 + 2 lion 80.5 + 3 monkey NaN + """ + return super().pop(item=item) + + @validate_bool_kwargs_from_keywords('inplace') + @doc(NDFrame.replace, **_shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=lib.no_default, + inplace: bool = False, + limit=None, + regex: bool = False, + method: str | lib.NoDefault = lib.no_default, + ): + return super().replace( + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + def _replace_columnwise( + self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex + ): + """ + Dispatch to Series.replace column-wise. 
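+        For example, ``mapping={"col": (1, 2)}`` replaces ``1`` with ``2``
+        within column ``"col"`` only.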
+ + Parameters + ---------- + mapping : dict + of the form {col: (target, value)} + inplace : bool + regex : bool or same types as `to_replace` in DataFrame.replace + + Returns + ------- + DataFrame or None + """ + # Operate column-wise + res = self if inplace else self.copy() + ax = self.columns + + for i in range(len(ax)): + if ax[i] in mapping: + ser = self.iloc[:, i] + + target, value = mapping[ax[i]] + newobj = ser.replace(target, value, regex=regex) + + res.iloc[:, i] = newobj + + if inplace: + return + return res.__finalize__(self) + + @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) + def shift( + self, + periods=1, + freq: Frequency | None = None, + axis: Axis = 0, + fill_value=lib.no_default, + ) -> DataFrame: + axis = self._get_axis_number(axis) + + ncols = len(self.columns) + if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: + # We will infer fill_value to match the closest column + + # Use a column that we know is valid for our column's dtype GH#38434 + label = self.columns[0] + + if periods > 0: + result = self.iloc[:, :-periods] + for col in range(min(ncols, abs(periods))): + # TODO(EA2D): doing this in a loop unnecessary with 2D EAs + # Define filler inside loop so we get a copy + filler = self.iloc[:, 0].shift(len(self)) + result.insert(0, label, filler, allow_duplicates=True) + else: + result = self.iloc[:, -periods:] + for col in range(min(ncols, abs(periods))): + # Define filler inside loop so we get a copy + filler = self.iloc[:, -1].shift(len(self)) + result.insert( + len(result.columns), label, filler, allow_duplicates=True + ) + + result.columns = self.columns.copy() + return result + elif ( + axis == 1 + and periods != 0 + and fill_value is not lib.no_default + and ncols > 0 + ): + arrays = self._mgr.arrays + if len(arrays) > 1 or ( + # If we only have one block and we know that we can't + # keep the same dtype (i.e. the _can_hold_element check) + # then we can go through the reindex_indexer path + # (and avoid casting logic in the Block method). + # The exception to this (until 2.0) is datetimelike + # dtypes with integers, which cast. + not can_hold_element(arrays[0], fill_value) + # TODO(2.0): remove special case for integer-with-datetimelike + # once deprecation is enforced + and not ( + lib.is_integer(fill_value) and needs_i8_conversion(arrays[0].dtype) + ) + ): + # GH#35488 we need to watch out for multi-block cases + # We only get here with fill_value not-lib.no_default + nper = abs(periods) + nper = min(nper, ncols) + if periods > 0: + indexer = np.array( + [-1] * nper + list(range(ncols - periods)), dtype=np.intp + ) + else: + indexer = np.array( + list(range(nper, ncols)) + [-1] * nper, dtype=np.intp + ) + mgr = self._mgr.reindex_indexer( + self.columns, + indexer, + axis=0, + fill_value=fill_value, + allow_dups=True, + ) + res_df = self._constructor(mgr) + return res_df.__finalize__(self, method="shift") + + return super().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value + ) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "keys"]) + @validate_bool_kwargs_from_keywords('inplace', 'drop', 'append', 'verify_integrity') + def set_index( + self, + keys, + drop: bool = True, + append: bool = False, + inplace: bool = False, + verify_integrity: bool = False, + ): + """ + Set the DataFrame index using existing columns. + + Set the DataFrame index (row labels) using one or more existing + columns or arrays (of the correct length). 
The index can replace the + existing index or expand on it. + + Parameters + ---------- + keys : label or array-like or list of labels/arrays + This parameter can be either a single column key, a single array of + the same length as the calling DataFrame, or a list containing an + arbitrary combination of column keys and arrays. Here, "array" + encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and + instances of :class:`~collections.abc.Iterator`. + drop : bool, default True + Delete columns to be used as the new index. + append : bool, default False + Whether to append columns to existing index. + inplace : bool, default False + If True, modifies the DataFrame in place (do not create a new object). + verify_integrity : bool, default False + Check the new index for duplicates. Otherwise defer the check until + necessary. Setting to False will improve the performance of this + method. + + Returns + ------- + DataFrame or None + Changed row labels or None if ``inplace=True``. + + See Also + -------- + DataFrame.reset_index : Opposite of set_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({'month': [1, 4, 7, 10], + ... 'year': [2012, 2014, 2013, 2014], + ... 'sale': [55, 40, 84, 31]}) + >>> df + month year sale + 0 1 2012 55 + 1 4 2014 40 + 2 7 2013 84 + 3 10 2014 31 + + Set the index to become the 'month' column: + + >>> df.set_index('month') + year sale + month + 1 2012 55 + 4 2014 40 + 7 2013 84 + 10 2014 31 + + Create a MultiIndex using columns 'year' and 'month': + + >>> df.set_index(['year', 'month']) + sale + year month + 2012 1 55 + 2014 4 40 + 2013 7 84 + 2014 10 31 + + Create a MultiIndex using an Index and a column: + + >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) + month sale + year + 1 2012 1 55 + 2 2014 4 40 + 3 2013 7 84 + 4 2014 10 31 + + Create a MultiIndex using two Series: + + >>> s = pd.Series([1, 2, 3, 4]) + >>> df.set_index([s, s**2]) + month year sale + 1 1 1 2012 55 + 2 4 4 2014 40 + 3 9 7 2013 84 + 4 16 10 2014 31 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) + if not isinstance(keys, list): + keys = [keys] + + err_msg = ( + 'The parameter "keys" may be a column key, one-dimensional ' + "array, or a list containing only valid column keys and " + "one-dimensional arrays." + ) + + missing: list[Hashable] = [] + for col in keys: + if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)): + # arrays are fine as long as they are one-dimensional + # iterators get converted to list below + if getattr(col, "ndim", 1) != 1: + raise ValueError(err_msg) + else: + # everything else gets tried as a key; see GH 24969 + try: + found = col in self.columns + except TypeError as err: + raise TypeError( + f"{err_msg}. 
Received column of type {type(col)}" + ) from err + else: + if not found: + missing.append(col) + + if missing: + raise KeyError(f"None of {missing} are in the columns") + + if inplace: + frame = self + else: + frame = self.copy() + + arrays = [] + names: list[Hashable] = [] + if append: + names = list(self.index.names) + if isinstance(self.index, MultiIndex): + for i in range(self.index.nlevels): + arrays.append(self.index._get_level_values(i)) + else: + arrays.append(self.index) + + to_remove: list[Hashable] = [] + for col in keys: + if isinstance(col, MultiIndex): + for n in range(col.nlevels): + arrays.append(col._get_level_values(n)) + names.extend(col.names) + elif isinstance(col, (Index, Series)): + # if Index then not MultiIndex (treated above) + + # error: Argument 1 to "append" of "list" has incompatible type + # "Union[Index, Series]"; expected "Index" + arrays.append(col) # type:ignore[arg-type] + names.append(col.name) + elif isinstance(col, (list, np.ndarray)): + # error: Argument 1 to "append" of "list" has incompatible type + # "Union[List[Any], ndarray]"; expected "Index" + arrays.append(col) # type: ignore[arg-type] + names.append(None) + elif isinstance(col, abc.Iterator): + # error: Argument 1 to "append" of "list" has incompatible type + # "List[Any]"; expected "Index" + arrays.append(list(col)) # type: ignore[arg-type] + names.append(None) + # from here, col can only be a column label + else: + arrays.append(frame[col]._values) + names.append(col) + if drop: + to_remove.append(col) + + if len(arrays[-1]) != len(self): + # check newest element against length of calling frame, since + # ensure_index_from_sequences would not raise for append=False. + raise ValueError( + f"Length mismatch: Expected {len(self)} rows, " + f"received array of length {len(arrays[-1])}" + ) + + index = ensure_index_from_sequences(arrays, names) + + if verify_integrity and not index.is_unique: + duplicates = index[index.duplicated()].unique() + raise ValueError(f"Index has duplicate keys: {duplicates}") + + # use set to handle duplicate column names gracefully in case of drop + for c in set(to_remove): + del frame[c] + + # clear up memory usage + index._cleanup() + + frame.index = index + + if not inplace: + return frame + + @overload + def reset_index( + self, + level: Hashable | Sequence[Hashable] | None = ..., + drop: bool = ..., + inplace: Literal[False] = ..., + col_level: Hashable = ..., + col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., + names: Hashable | Sequence[Hashable] = None, + ) -> DataFrame: + ... + + @overload + def reset_index( + self, + level: Hashable | Sequence[Hashable] | None, + drop: bool, + inplace: Literal[True], + col_level: Hashable = ..., + col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., + names: Hashable | Sequence[Hashable] = None, + ) -> None: + ... + + @overload + def reset_index( + self, + *, + drop: bool, + inplace: Literal[True], + col_level: Hashable = ..., + col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., + names: Hashable | Sequence[Hashable] = None, + ) -> None: + ... + + @overload + def reset_index( + self, + level: Hashable | Sequence[Hashable] | None, + *, + inplace: Literal[True], + col_level: Hashable = ..., + col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., + names: Hashable | Sequence[Hashable] = None, + ) -> None: + ... 
+ + @overload + def reset_index( + self, + *, + inplace: Literal[True], + col_level: Hashable = ..., + col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., + names: Hashable | Sequence[Hashable] = None, + ) -> None: + ... + + @overload + def reset_index( + self, + level: Hashable | Sequence[Hashable] | None = ..., + drop: bool = ..., + inplace: bool = ..., + col_level: Hashable = ..., + col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., + names: Hashable | Sequence[Hashable] = None, + ) -> DataFrame | None: + ... + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) + @validate_bool_kwargs_from_keywords('inplace', 'drop') + def reset_index( + self, + level: Hashable | Sequence[Hashable] | None = None, + drop: bool = False, + inplace: bool = False, + col_level: Hashable = 0, + col_fill: Hashable = "", + allow_duplicates: bool | lib.NoDefault = lib.no_default, + names: Hashable | Sequence[Hashable] = None, + ) -> DataFrame | None: + """ + Reset the index, or a level of it. + + Reset the index of the DataFrame, and use the default one instead. + If the DataFrame has a MultiIndex, this method can remove one or more + levels. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default. + drop : bool, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + col_level : int or str, default 0 + If the columns have multiple levels, determines which level the + labels are inserted into. By default it is inserted into the first + level. + col_fill : object, default '' + If the columns have multiple levels, determines how the other + levels are named. If None then the index name is repeated. + allow_duplicates : bool, optional, default lib.no_default + Allow duplicate column labels to be created. + + .. versionadded:: 1.5.0 + + names : int, str or 1-dimensional list, default None + Using the given string, rename the DataFrame column which contains the + index data. If the DataFrame has a MultiIndex, this has to be a list or + tuple with length equal to the number of levels. + + .. versionadded:: 1.5.0 + + Returns + ------- + DataFrame or None + DataFrame with the new index or None if ``inplace=True``. + + See Also + -------- + DataFrame.set_index : Opposite of reset_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + >>> df = pd.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), + ... ('mammal', np.nan)], + ... index=['falcon', 'parrot', 'lion', 'monkey'], + ... columns=('class', 'max_speed')) + >>> df + class max_speed + falcon bird 389.0 + parrot bird 24.0 + lion mammal 80.5 + monkey mammal NaN + + When we reset the index, the old index is added as a column, and a + new sequential index is used: + + >>> df.reset_index() + index class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + We can use the `drop` parameter to avoid the old index being added as + a column: + + >>> df.reset_index(drop=True) + class max_speed + 0 bird 389.0 + 1 bird 24.0 + 2 mammal 80.5 + 3 mammal NaN + + You can also use `reset_index` with `MultiIndex`. + + >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), + ... 
('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), + ... ('species', 'type')]) + >>> df = pd.DataFrame([(389.0, 'fly'), + ... ( 24.0, 'fly'), + ... ( 80.5, 'run'), + ... (np.nan, 'jump')], + ... index=index, + ... columns=columns) + >>> df + speed species + max type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey NaN jump + + Using the `names` parameter, choose a name for the index column: + + >>> df.reset_index(names=['classes', 'names']) + classes names speed species + max type + 0 bird falcon 389.0 fly + 1 bird parrot 24.0 fly + 2 mammal lion 80.5 run + 3 mammal monkey NaN jump + + If the index has multiple levels, we can reset a subset of them: + + >>> df.reset_index(level='class') + class speed species + max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we are not dropping the index, by default, it is placed in the top + level. We can place it in another level: + + >>> df.reset_index(level='class', col_level=1) + speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + When the index is inserted under another level, we can specify under + which one with the parameter `col_fill`: + + >>> df.reset_index(level='class', col_level=1, col_fill='species') + species speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we specify a nonexistent level for `col_fill`, it is created: + + >>> df.reset_index(level='class', col_level=1, col_fill='genus') + genus speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + """ + inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) + if inplace: + new_obj = self + else: + new_obj = self.copy() + if allow_duplicates is not lib.no_default: + allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") + + new_index = default_index(len(new_obj)) + if level is not None: + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.index._get_level_number(lev) for lev in level] + if len(level) < self.index.nlevels: + new_index = self.index.droplevel(level) + + if not drop: + to_insert: Iterable[tuple[Any, Any | None]] + + default = "index" if "index" not in self else "level_0" + names = self.index._get_default_index_names(names, default) + + if isinstance(self.index, MultiIndex): + to_insert = zip(self.index.levels, self.index.codes) + else: + to_insert = ((self.index, None),) + + multi_col = isinstance(self.columns, MultiIndex) + for i, (lev, lab) in reversed(list(enumerate(to_insert))): + if level is not None and i not in level: + continue + name = names[i] + if multi_col: + col_name = list(name) if isinstance(name, tuple) else [name] + if col_fill is None: + if len(col_name) not in (1, self.columns.nlevels): + raise ValueError( + "col_fill=None is incompatible " + f"with incomplete column name {name}" + ) + col_fill = col_name[0] + + lev_num = self.columns._get_level_number(col_level) + name_lst = [col_fill] * lev_num + col_name + missing = self.columns.nlevels - len(name_lst) + name_lst += [col_fill] * missing + name = tuple(name_lst) + + # to ndarray and maybe infer different dtype + level_values = lev._values + 
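+                # lib.maybe_convert_objects infers a tighter dtype for
+                # object-dtype level values where possible (e.g. boxed
+                # integers become int64, while strings stay object), so the
+                # reinserted index column round-trips with a sensible dtype.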
+                if level_values.dtype == np.object_:
+                    level_values = lib.maybe_convert_objects(level_values)
+
+                if lab is not None:
+                    # if we have the codes, extract the values with a mask
+                    level_values = algorithms.take(
+                        level_values, lab, allow_fill=True, fill_value=lev._na_value
+                    )
+
+                new_obj.insert(
+                    0,
+                    name,
+                    level_values,
+                    allow_duplicates=allow_duplicates,
+                )
+
+        new_obj.index = new_index
+        if not inplace:
+            return new_obj
+
+        return None
+
+    # ----------------------------------------------------------------------
+    # Reindex-based selection methods
+
+    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
+    def isna(self) -> DataFrame:
+        result = self._constructor(self._mgr.isna(func=isna))
+        return result.__finalize__(self, method="isna")
+
+    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
+    def isnull(self) -> DataFrame:
+        """
+        DataFrame.isnull is an alias for DataFrame.isna.
+        """
+        return self.isna()
+
+    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
+    def notna(self) -> DataFrame:
+        return ~self.isna()
+
+    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
+    def notnull(self) -> DataFrame:
+        """
+        DataFrame.notnull is an alias for DataFrame.notna.
+        """
+        return ~self.isna()
+
+    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
+    @validate_bool_kwargs_from_keywords('inplace')
+    def dropna(
+        self,
+        axis: Axis = 0,
+        how: str | NoDefault = no_default,
+        thresh: int | NoDefault = no_default,
+        subset: IndexLabel = None,
+        inplace: bool = False,
+    ):
+        """
+        Remove missing values.
+
+        See the :ref:`User Guide <missing_data>` for more on which values are
+        considered missing, and how to work with missing data.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Determine if rows or columns which contain missing values are
+            removed.
+
+            * 0, or 'index' : Drop rows which contain missing values.
+            * 1, or 'columns' : Drop columns which contain missing values.
+
+            .. versionchanged:: 1.0.0
+
+               Pass tuple or list to drop on multiple axes.
+               Only a single axis is allowed.
+
+        how : {'any', 'all'}, default 'any'
+            Determine if row or column is removed from DataFrame, when we have
+            at least one NA or all NA.
+
+            * 'any' : If any NA values are present, drop that row or column.
+            * 'all' : If all values are NA, drop that row or column.
+
+        thresh : int, optional
+            Require that many non-NA values. Cannot be combined with how.
+        subset : column label or sequence of labels, optional
+            Labels along other axis to consider, e.g. if you are dropping rows
+            these would be a list of columns to include.
+        inplace : bool, default False
+            If True, do operation inplace and return None.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame with NA entries dropped from it or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.isna : Indicate missing values.
+        DataFrame.notna : Indicate existing (non-missing) values.
+        DataFrame.fillna : Replace missing values.
+        Series.dropna : Drop missing values.
+        Index.dropna : Drop missing indices.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
+        ...                    "toy": [np.nan, 'Batmobile', 'Bullwhip'],
+        ...                    "born": [pd.NaT, pd.Timestamp("1940-04-25"),
+        ...                             pd.NaT]})
+        >>> df
+               name        toy       born
+        0    Alfred        NaN        NaT
+        1    Batman  Batmobile 1940-04-25
+        2  Catwoman   Bullwhip        NaT
+
+        Drop the rows where at least one element is missing.
+
+        >>> df.dropna()
+             name        toy       born
+        1  Batman  Batmobile 1940-04-25
+
+        Drop the columns where at least one element is missing.
+ + >>> df.dropna(axis='columns') + name + 0 Alfred + 1 Batman + 2 Catwoman + + Drop the rows where all elements are missing. + + >>> df.dropna(how='all') + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Keep only the rows with at least 2 non-NA values. + + >>> df.dropna(thresh=2) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Define in which columns to look for missing values. + + >>> df.dropna(subset=['name', 'toy']) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Keep the DataFrame with valid entries in the same variable. + + >>> df.dropna(inplace=True) + >>> df + name toy born + 1 Batman Batmobile 1940-04-25 + """ + if (how is not no_default) and (thresh is not no_default): + raise TypeError( + "You cannot set both the how and thresh arguments at the same time." + ) + + if how is no_default: + how = "any" + + inplace = validate_bool_kwarg(inplace, "inplace") + if isinstance(axis, (tuple, list)): + # GH20987 + raise TypeError("supplying multiple axes to axis is no longer supported.") + + axis = self._get_axis_number(axis) + agg_axis = 1 - axis + + agg_obj = self + if subset is not None: + # subset needs to be list + if not is_list_like(subset): + subset = [subset] + ax = self._get_axis(agg_axis) + indices = ax.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(np.array(subset)[check].tolist()) + agg_obj = self.take(indices, axis=agg_axis) + + if thresh is not no_default: + count = agg_obj.count(axis=agg_axis) + mask = count >= thresh + elif how == "any": + # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]' + mask = notna(agg_obj).all(axis=agg_axis, bool_only=False) + elif how == "all": + # faster equivalent to 'agg_obj.count(agg_axis) > 0' + mask = notna(agg_obj).any(axis=agg_axis, bool_only=False) + else: + if how is not no_default: + raise ValueError(f"invalid how option: {how}") + + if np.all(mask): + result = self.copy() + else: + result = self.loc(axis=axis)[mask] + + if inplace: + self._update_inplace(result) + else: + return result + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"]) + @validate_bool_kwargs_from_keywords('inplace', 'ignore_index') + def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = None, + keep: Literal["first"] | Literal["last"] | Literal[False] = "first", + inplace: bool = False, + ignore_index: bool = False, + ) -> DataFrame | None: + """ + Return DataFrame with duplicate rows removed. + + Considering certain columns is optional. Indexes, including time indexes + are ignored. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to keep. + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + inplace : bool, default False + Whether to drop duplicates in place or to return a copy. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 + + Returns + ------- + DataFrame or None + DataFrame with duplicates removed or None if ``inplace=True``. + + See Also + -------- + DataFrame.value_counts: Count unique combinations of columns. 
+ + Examples + -------- + Consider dataset containing ramen rating. + + >>> df = pd.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, it removes duplicate rows based on all columns. + + >>> df.drop_duplicates() + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + To remove duplicates on specific column(s), use ``subset``. + + >>> df.drop_duplicates(subset=['brand']) + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + + To remove duplicates and keep last occurrences, use ``keep``. + + >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') + brand style rating + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 4 Indomie pack 5.0 + """ + if self.empty: + return self.copy() + + inplace = validate_bool_kwarg(inplace, "inplace") + ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") + duplicated = self.duplicated(subset, keep=keep) + + result = self[-duplicated] + if ignore_index: + result.index = default_index(len(result)) + + if inplace: + self._update_inplace(result) + return None + else: + return result + + def duplicated( + self, + subset: Hashable | Sequence[Hashable] | None = None, + keep: Literal["first"] | Literal["last"] | Literal[False] = "first", + ) -> Series: + """ + Return boolean Series denoting duplicate rows. + + Considering certain columns is optional. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to mark. + + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + Series + Boolean series for each duplicated rows. + + See Also + -------- + Index.duplicated : Equivalent method on index. + Series.duplicated : Equivalent method on Series. + Series.drop_duplicates : Remove duplicate values from Series. + DataFrame.drop_duplicates : Remove duplicate values from DataFrame. + + Examples + -------- + Consider dataset containing ramen rating. + + >>> df = pd.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, for each set of duplicated values, the first occurrence + is set on False and all others on True. + + >>> df.duplicated() + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True. + + >>> df.duplicated(keep='last') + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + By setting ``keep`` on False, all duplicates are True. + + >>> df.duplicated(keep=False) + 0 True + 1 True + 2 False + 3 False + 4 False + dtype: bool + + To find duplicates on specific column(s), use ``subset``. 
+ + >>> df.duplicated(subset=['brand']) + 0 False + 1 True + 2 False + 3 True + 4 True + dtype: bool + """ + + if self.empty: + return self._constructor_sliced(dtype=bool) + + def f(vals) -> tuple[np.ndarray, int]: + labels, shape = algorithms.factorize(vals, size_hint=len(self)) + return labels.astype("i8", copy=False), len(shape) + + if subset is None: + # https://github.com/pandas-dev/pandas/issues/28770 + # Incompatible types in assignment (expression has type "Index", variable + # has type "Sequence[Any]") + subset = self.columns # type: ignore[assignment] + elif ( + not np.iterable(subset) + or isinstance(subset, str) + or isinstance(subset, tuple) + and subset in self.columns + ): + subset = (subset,) + + # needed for mypy since can't narrow types using np.iterable + subset = cast(Sequence, subset) + + # Verify all columns in subset exist in the queried dataframe + # Otherwise, raise a KeyError, same as if you try to __getitem__ with a + # key that doesn't exist. + diff = set(subset) - set(self.columns) + if diff: + raise KeyError(Index(diff)) + + if len(subset) == 1 and self.columns.is_unique: + # GH#45236 This is faster than get_group_index below + result = self[subset[0]].duplicated(keep) + result.name = None + else: + vals = (col.values for name, col in self.items() if name in subset) + labels, shape = map(list, zip(*map(f, vals))) + + ids = get_group_index( + labels, + # error: Argument 1 to "tuple" has incompatible type "List[_T]"; + # expected "Iterable[int]" + tuple(shape), # type: ignore[arg-type] + sort=False, + xnull=False, + ) + result = self._constructor_sliced(duplicated(ids, keep), index=self.index) + return result.__finalize__(self, method="duplicated") + + # ---------------------------------------------------------------------- + # Sorting + # TODO: Just move the sort_values doc here. 
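+
+ # NOTE (illustrative sketch, not part of this patch): the
+ # ``validate_bool_kwargs_from_keywords`` decorator applied to methods in
+ # this module is assumed to validate each named boolean keyword with
+ # ``validate_bool_kwarg`` before the wrapped method runs, roughly like the
+ # hypothetical version below; the real definition lives in
+ # ``pandas.util._validators``:
+ #
+ # def validate_bool_kwargs_from_keywords(*names):
+ # def decorator(func):
+ # @functools.wraps(func)
+ # def wrapper(*args, **kwargs):
+ # for name in names:
+ # if name in kwargs:
+ # # raises ValueError for non-bool, non-None values
+ # kwargs[name] = validate_bool_kwarg(kwargs[name], name)
+ # return func(*args, **kwargs)
+ # return wrapper
+ # return decorator
+ #
+ # Under this assumed contract the decorator only sees arguments actually
+ # passed by keyword, which is presumably why the in-body
+ # ``validate_bool_kwarg`` calls are kept for still-allowed positional use.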
+ @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "by"]) + @Substitution(**_shared_doc_kwargs) + @Appender(NDFrame.sort_values.__doc__) + @validate_bool_kwargs_from_keywords('inplace', 'ignore_index') + # error: Signature of "sort_values" incompatible with supertype "NDFrame" + def sort_values( # type: ignore[override] + self, + by, + axis: Axis = 0, + ascending=True, + inplace: bool = False, + kind: str = "quicksort", + na_position: str = "last", + ignore_index: bool = False, + key: ValueKeyFunc = None, + ): + inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) + ascending = validate_ascending(ascending) + if not isinstance(by, list): + by = [by] + if is_sequence(ascending) and len(by) != len(ascending): + raise ValueError( + f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" + ) + if len(by) > 1: + + keys = [self._get_label_or_level_values(x, axis=axis) for x in by] + + # need to rewrap columns in Series to apply key function + if key is not None: + # error: List comprehension has incompatible type List[Series]; + # expected List[ndarray] + keys = [ + Series(k, name=name) # type: ignore[misc] + for (k, name) in zip(keys, by) + ] + + indexer = lexsort_indexer( + keys, orders=ascending, na_position=na_position, key=key + ) + elif len(by): + # len(by) == 1 + + by = by[0] + k = self._get_label_or_level_values(by, axis=axis) + + # need to rewrap column in Series to apply key function + if key is not None: + # error: Incompatible types in assignment (expression has type + # "Series", variable has type "ndarray") + k = Series(k, name=by) # type: ignore[assignment] + + if isinstance(ascending, (tuple, list)): + ascending = ascending[0] + + indexer = nargsort( + k, kind=kind, ascending=ascending, na_position=na_position, key=key + ) + else: + return self.copy() + + new_data = self._mgr.take( + indexer, axis=self._get_block_manager_axis(axis), verify=False + ) + + if ignore_index: + new_data.set_axis( + self._get_block_manager_axis(axis), default_index(len(indexer)) + ) + + result = self._constructor(new_data) + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="sort_values") + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: Level | None = ..., + ascending: bool | Sequence[bool] = ..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> None: + ... + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: Level | None = ..., + ascending: bool | Sequence[bool] = ..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> DataFrame: + ... + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: Level | None = ..., + ascending: bool | Sequence[bool] = ..., + inplace: bool = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> DataFrame | None: + ... 
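+
+ # NOTE (illustrative only): the three ``@overload`` stubs above let type
+ # checkers tie the return type of ``sort_index`` to the ``inplace``
+ # literal; only the implementation that follows runs at runtime. For
+ # example, under a checker such as mypy:
+ #
+ # df.sort_index(inplace=True) # checked as -> None
+ # df.sort_index(inplace=False) # checked as -> DataFrame
+ # df.sort_index(inplace=some_bool) # checked as -> DataFrame | None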
+ + # error: Signature of "sort_index" incompatible with supertype "NDFrame" + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @validate_bool_kwargs_from_keywords('inplace', 'sort_remaining', 'ignore_index') + def sort_index( # type: ignore[override] + self, + axis: Axis = 0, + level: Level | None = None, + ascending: bool | Sequence[bool] = True, + inplace: bool = False, + kind: SortKind = "quicksort", + na_position: NaPosition = "last", + sort_remaining: bool = True, + ignore_index: bool = False, + key: IndexKeyFunc = None, + ) -> DataFrame | None: + """ + Sort object by labels (along an axis). + + Returns a new DataFrame sorted by label if `inplace` argument is + ``False``, otherwise updates the original DataFrame and returns None. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + ascending : bool or list-like of bools, default True + Sort ascending vs. descending. When the index is a MultiIndex the + sort direction can be controlled for each level individually. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. `mergesort` and `stable` are the only stable algorithms. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + Not implemented for MultiIndex. + sort_remaining : bool, default True + If True and sorting by level and index is multilevel, sort by other + levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 + + key : callable, optional + If not None, apply the key function to the index values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect an + ``Index`` and return an ``Index`` of the same shape. For MultiIndex + inputs, the key is applied *per level*. + + .. versionadded:: 1.1.0 + + Returns + ------- + DataFrame or None + The original DataFrame sorted by the labels or None if ``inplace=True``. + + See Also + -------- + Series.sort_index : Sort Series by the index. + DataFrame.sort_values : Sort DataFrame by the value. + Series.sort_values : Sort Series by the value. + + Examples + -------- + >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], + ... columns=['A']) + >>> df.sort_index() + A + 1 4 + 29 2 + 100 1 + 150 5 + 234 3 + + By default, it sorts in ascending order, to sort in descending order, + use ``ascending=False`` + + >>> df.sort_index(ascending=False) + A + 234 3 + 150 5 + 100 1 + 29 2 + 1 4 + + A key function can be specified which is applied to the index before + sorting. For a ``MultiIndex`` this is applied to each level separately. 
+ + >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) + >>> df.sort_index(key=lambda x: x.str.lower()) + a + A 1 + b 2 + C 3 + d 4 + """ + return super().sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + key=key, + ) + + @validate_bool_kwargs_from_keywords('normalize', 'sort', 'ascending', 'dropna') + def value_counts( + self, + subset: Sequence[Hashable] | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ): + """ + Return a Series containing counts of unique rows in the DataFrame. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don’t include counts of rows that contain NA values. + + .. versionadded:: 1.3.0 + + Returns + ------- + Series + + See Also + -------- + Series.value_counts: Equivalent method on Series. + + Notes + ----- + The returned Series will have a MultiIndex with one level per input + column. By default, rows that contain any NA values are omitted from + the result. By default, the resulting Series will be in descending + order so that the first element is the most frequently-occurring row. + + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], + ... 'num_wings': [2, 0, 0, 0]}, + ... index=['falcon', 'dog', 'cat', 'ant']) + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + cat 4 0 + ant 6 0 + + >>> df.value_counts() + num_legs num_wings + 4 0 2 + 2 2 1 + 6 0 1 + dtype: int64 + + >>> df.value_counts(sort=False) + num_legs num_wings + 2 2 1 + 4 0 2 + 6 0 1 + dtype: int64 + + >>> df.value_counts(ascending=True) + num_legs num_wings + 2 2 1 + 6 0 1 + 4 0 2 + dtype: int64 + + >>> df.value_counts(normalize=True) + num_legs num_wings + 4 0 0.50 + 2 2 0.25 + 6 0 0.25 + dtype: float64 + + With `dropna` set to `False` we can also count rows with NA values. + + >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], + ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) + >>> df + first_name middle_name + 0 John Smith + 1 Anne + 2 John + 3 Beth Louise + + >>> df.value_counts() + first_name middle_name + Beth Louise 1 + John Smith 1 + dtype: int64 + + >>> df.value_counts(dropna=False) + first_name middle_name + Anne NaN 1 + Beth Louise 1 + John Smith 1 + NaN 1 + dtype: int64 + """ + if subset is None: + subset = self.columns.tolist() + + counts = self.groupby(subset, dropna=dropna).grouper.size() + + if sort: + counts = counts.sort_values(ascending=ascending) + if normalize: + counts /= counts.sum() + + # Force MultiIndex for single column + if len(subset) == 1: + counts.index = MultiIndex.from_arrays( + [counts.index], names=[counts.index.name] + ) + + return counts + + def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: + """ + Return the first `n` rows ordered by `columns` in descending order. + + Return the first `n` rows with the largest values in `columns`, in + descending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=False).head(n)``, but more + performant. 
+ + Parameters + ---------- + n : int + Number of rows to return. + columns : label or list of labels + Column label(s) to order by. + keep : {'first', 'last', 'all'}, default 'first' + Where there are duplicate values: + + - ``first`` : prioritize the first occurrence(s) + - ``last`` : prioritize the last occurrence(s) + - ``all`` : do not drop any duplicates, even it means + selecting more than `n` items. + + Returns + ------- + DataFrame + The first `n` rows ordered by the given columns in descending + order. + + See Also + -------- + DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in + ascending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + + Notes + ----- + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. + + Examples + -------- + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + In the following example, we will use ``nlargest`` to select the three + rows having the largest values in column "population". + + >>> df.nlargest(3, 'population') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + + When using ``keep='last'``, ties are resolved in reverse order: + + >>> df.nlargest(3, 'population', keep='last') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN + + When using ``keep='all'``, all duplicate items are maintained: + + >>> df.nlargest(3, 'population', keep='all') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + + To order by the largest values in column "population" and then "GDP", + we can specify multiple columns like in the next example. + + >>> df.nlargest(3, ['population', 'GDP']) + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN + """ + return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() + + def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: + """ + Return the first `n` rows ordered by `columns` in ascending order. + + Return the first `n` rows with the smallest values in `columns`, in + ascending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=True).head(n)``, but more + performant. + + Parameters + ---------- + n : int + Number of items to retrieve. + columns : list or str + Column name or names to order by. + keep : {'first', 'last', 'all'}, default 'first' + Where there are duplicate values: + + - ``first`` : take the first occurrence. + - ``last`` : take the last occurrence. 
+ - ``all`` : do not drop any duplicates, even it means + selecting more than `n` items. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.nlargest : Return the first `n` rows ordered by `columns` in + descending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + + Examples + -------- + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 337000, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 337000 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + In the following example, we will use ``nsmallest`` to select the + three rows having the smallest values in column "population". + + >>> df.nsmallest(3, 'population') + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + + When using ``keep='last'``, ties are resolved in reverse order: + + >>> df.nsmallest(3, 'population', keep='last') + population GDP alpha-2 + Anguilla 11300 311 AI + Tuvalu 11300 38 TV + Nauru 337000 182 NR + + When using ``keep='all'``, all duplicate items are maintained: + + >>> df.nsmallest(3, 'population', keep='all') + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + Nauru 337000 182 NR + + To order by the smallest values in column "population" and then "GDP", we can + specify multiple columns like in the next example. + + >>> df.nsmallest(3, ['population', 'GDP']) + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Nauru 337000 182 NR + """ + return algorithms.SelectNFrame( + self, n=n, keep=keep, columns=columns + ).nsmallest() + + @doc( + Series.swaplevel, + klass=_shared_doc_kwargs["klass"], + extra_params=dedent( + """axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to swap levels on. 0 or 'index' for row-wise, 1 or + 'columns' for column-wise.""" + ), + examples=dedent( + """\ + Examples + -------- + >>> df = pd.DataFrame( + ... {"Grade": ["A", "B", "A", "C"]}, + ... index=[ + ... ["Final exam", "Final exam", "Coursework", "Coursework"], + ... ["History", "Geography", "History", "Geography"], + ... ["January", "February", "March", "April"], + ... ], + ... ) + >>> df + Grade + Final exam History January A + Geography February B + Coursework History March A + Geography April C + + In the following example, we will swap the levels of the indices. + Here, we will swap the levels column-wise, but levels can be swapped row-wise + in a similar manner. Note that column-wise is the default behaviour. + By not supplying any arguments for i and j, we swap the last and second to + last indices. + + >>> df.swaplevel() + Grade + Final exam January History A + February Geography B + Coursework March History A + April Geography C + + By supplying one argument, we can choose which index to swap the last + index with. We can for example swap the first index with the last one as + follows. 
+ + >>> df.swaplevel(0) + Grade + January History Final exam A + February Geography Final exam B + March History Coursework A + April Geography Coursework C + + We can also define explicitly which indices we want to swap by supplying values + for both i and j. Here, we for example swap the first and second indices. + + >>> df.swaplevel(0, 1) + Grade + History Final exam January A + Geography Final exam February B + History Coursework March A + Geography Coursework April C""" + ), + ) + def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + result = self.copy() + + axis = self._get_axis_number(axis) + + if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover + raise TypeError("Can only swap levels on a hierarchical axis.") + + if axis == 0: + assert isinstance(result.index, MultiIndex) + result.index = result.index.swaplevel(i, j) + else: + assert isinstance(result.columns, MultiIndex) + result.columns = result.columns.swaplevel(i, j) + return result + + def reorder_levels(self, order: Sequence[Axis], axis: Axis = 0) -> DataFrame: + """ + Rearrange index levels using input order. May not drop or duplicate levels. + + Parameters + ---------- + order : list of int or list of str + List representing new level order. Reference level by number + (position) or by key (label). + axis : {0 or 'index', 1 or 'columns'}, default 0 + Where to reorder levels. + + Returns + ------- + DataFrame + + Examples + -------- + >>> data = { + ... "class": ["Mammals", "Mammals", "Reptiles"], + ... "diet": ["Omnivore", "Carnivore", "Carnivore"], + ... "species": ["Humans", "Dogs", "Snakes"], + ... } + >>> df = pd.DataFrame(data, columns=["class", "diet", "species"]) + >>> df = df.set_index(["class", "diet"]) + >>> df + species + class diet + Mammals Omnivore Humans + Carnivore Dogs + Reptiles Carnivore Snakes + + Let's reorder the levels of the index: + + >>> df.reorder_levels(["diet", "class"]) + species + diet class + Omnivore Mammals Humans + Carnivore Mammals Dogs + Reptiles Snakes + """ + axis = self._get_axis_number(axis) + if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover + raise TypeError("Can only reorder levels on a hierarchical axis.") + + result = self.copy() + + if axis == 0: + assert isinstance(result.index, MultiIndex) + result.index = result.index.reorder_levels(order) + else: + assert isinstance(result.columns, MultiIndex) + result.columns = result.columns.reorder_levels(order) + return result + + # ---------------------------------------------------------------------- + # Arithmetic Methods + + def _cmp_method(self, other, op): + axis = 1 # only relevant for Series other case + + self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None) + + # See GH#4537 for discussion of scalar op behavior + new_data = self._dispatch_frame_op(other, op, axis=axis) + return self._construct_result(new_data) + + def _arith_method(self, other, op): + if ops.should_reindex_frame_op(self, other, op, 1, 1, None, None): + return ops.frame_arith_method_with_reindex(self, other, op) + + axis = 1 # only relevant for Series other case + other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],)) + + self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None) + + new_data = self._dispatch_frame_op(other, op, axis=axis) + return self._construct_result(new_data) + + _logical_method = _arith_method + + def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): + """ + Evaluate the frame operation 
func(left, right) by evaluating + column-by-column, dispatching to the Series implementation. + + Parameters + ---------- + right : scalar, Series, or DataFrame + func : arithmetic or comparison operator + axis : {None, 0, 1} + + Returns + ------- + DataFrame + """ + # Get the appropriate array-op to apply to each column/block's values. + array_op = ops.get_array_op(func) + + right = lib.item_from_zerodim(right) + if not is_list_like(right): + # i.e. scalar, faster than checking np.ndim(right) == 0 + with np.errstate(all="ignore"): + bm = self._mgr.apply(array_op, right=right) + return self._constructor(bm) + + elif isinstance(right, DataFrame): + assert self.index.equals(right.index) + assert self.columns.equals(right.columns) + # TODO: The previous assertion `assert right._indexed_same(self)` + # fails in cases with empty columns reached via + # _frame_arith_method_with_reindex + + # TODO operate_blockwise expects a manager of the same type + with np.errstate(all="ignore"): + bm = self._mgr.operate_blockwise( + # error: Argument 1 to "operate_blockwise" of "ArrayManager" has + # incompatible type "Union[ArrayManager, BlockManager]"; expected + # "ArrayManager" + # error: Argument 1 to "operate_blockwise" of "BlockManager" has + # incompatible type "Union[ArrayManager, BlockManager]"; expected + # "BlockManager" + right._mgr, # type: ignore[arg-type] + array_op, + ) + return self._constructor(bm) + + elif isinstance(right, Series) and axis == 1: + # axis=1 means we want to operate row-by-row + assert right.index.equals(self.columns) + + right = right._values + # maybe_align_as_frame ensures we do not have an ndarray here + assert not isinstance(right, np.ndarray) + + with np.errstate(all="ignore"): + arrays = [ + array_op(_left, _right) + for _left, _right in zip(self._iter_column_arrays(), right) + ] + + elif isinstance(right, Series): + assert right.index.equals(self.index) # Handle other cases later + right = right._values + + with np.errstate(all="ignore"): + arrays = [array_op(left, right) for left in self._iter_column_arrays()] + + else: + # Remaining cases have less-obvious dispatch rules + raise NotImplementedError(right) + + return type(self)._from_arrays( + arrays, self.columns, self.index, verify_integrity=False + ) + + def _combine_frame(self, other: DataFrame, func, fill_value=None): + # at this point we have `self._indexed_same(other)` + + if fill_value is None: + # since _arith_op may be called in a loop, avoid function call + # overhead if possible by doing this check once + _arith_op = func + + else: + + def _arith_op(left, right): + # for the mixed_type case where we iterate over columns, + # _arith_op(left, right) is equivalent to + # left._binop(right, func, fill_value=fill_value) + left, right = ops.fill_binop(left, right, fill_value) + return func(left, right) + + new_data = self._dispatch_frame_op(other, _arith_op) + return new_data + + def _construct_result(self, result) -> DataFrame: + """ + Wrap the result of an arithmetic, comparison, or logical operation. 
+ + Parameters + ---------- + result : DataFrame + + Returns + ------- + DataFrame + """ + out = self._constructor(result, copy=False) + # Pin columns instead of passing to constructor for compat with + # non-unique columns case + out.columns = self.columns + out.index = self.index + return out + + def __divmod__(self, other) -> tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = self // other + mod = self - div * other + return div, mod + + def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = other // self + mod = other - div * self + return div, mod + + # ---------------------------------------------------------------------- + # Combination-Related + + @doc( + _shared_docs["compare"], + """ +Returns +------- +DataFrame + DataFrame that shows the differences stacked side by side. + + The resulting index will be a MultiIndex with 'self' and 'other' + stacked alternately at the inner level. + +Raises +------ +ValueError + When the two DataFrames don't have identical labels or shape. + +See Also +-------- +Series.compare : Compare with another Series and show differences. +DataFrame.equals : Test whether two objects contain the same elements. + +Notes +----- +Matching NaNs will not appear as a difference. + +Can only compare identically-labeled +(i.e. same shape, identical row and column labels) DataFrames + +Examples +-------- +>>> df = pd.DataFrame( +... {{ +... "col1": ["a", "a", "b", "b", "a"], +... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], +... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] +... }}, +... columns=["col1", "col2", "col3"], +... ) +>>> df + col1 col2 col3 +0 a 1.0 1.0 +1 a 2.0 2.0 +2 b 3.0 3.0 +3 b NaN 4.0 +4 a 5.0 5.0 + +>>> df2 = df.copy() +>>> df2.loc[0, 'col1'] = 'c' +>>> df2.loc[2, 'col3'] = 4.0 +>>> df2 + col1 col2 col3 +0 c 1.0 1.0 +1 a 2.0 2.0 +2 b 3.0 4.0 +3 b NaN 4.0 +4 a 5.0 5.0 + +Align the differences on columns + +>>> df.compare(df2) + col1 col3 + self other self other +0 a c NaN NaN +2 NaN NaN 3.0 4.0 + +Stack the differences on rows + +>>> df.compare(df2, align_axis=0) + col1 col3 +0 self a NaN + other c NaN +2 self NaN 3.0 + other NaN 4.0 + +Keep the equal values + +>>> df.compare(df2, keep_equal=True) + col1 col3 + self other self other +0 a c 1.0 1.0 +2 b b 3.0 4.0 + +Keep all original rows and columns + +>>> df.compare(df2, keep_shape=True) + col1 col2 col3 + self other self other self other +0 a c NaN NaN NaN NaN +1 NaN NaN NaN NaN NaN NaN +2 NaN NaN NaN NaN 3.0 4.0 +3 NaN NaN NaN NaN NaN NaN +4 NaN NaN NaN NaN NaN NaN + +Keep all original rows and columns and also all original values + +>>> df.compare(df2, keep_shape=True, keep_equal=True) + col1 col2 col3 + self other self other self other +0 a c 1.0 1.0 1.0 1.0 +1 a a 2.0 2.0 2.0 2.0 +2 b b 3.0 3.0 3.0 4.0 +3 b b NaN NaN 4.0 4.0 +4 a a 5.0 5.0 5.0 5.0 +""", + klass=_shared_doc_kwargs["klass"], + ) + @validate_bool_kwargs_from_keywords('keep_shape', 'keep_equal') + def compare( + self, + other: DataFrame, + align_axis: Axis = 1, + keep_shape: bool = False, + keep_equal: bool = False, + ) -> DataFrame: + return super().compare( + other=other, + align_axis=align_axis, + keep_shape=keep_shape, + keep_equal=keep_equal, + ) + + @validate_bool_kwargs_from_keywords('overwrite') + def combine( + self, other: DataFrame, func, fill_value=None, overwrite: bool = True + ) -> DataFrame: + """ + Perform column-wise combine with another DataFrame. 
+
+ Combines a DataFrame with `other` DataFrame using `func`
+ to element-wise combine columns. The row and column indexes of the
+ resulting DataFrame will be the union of the two.
+
+ Parameters
+ ----------
+ other : DataFrame
+ The DataFrame to merge column-wise.
+ func : function
+ Function that takes two Series as inputs and returns a Series or a
+ scalar. Used to merge the two dataframes column by column.
+ fill_value : scalar value, default None
+ The value to fill NaNs with prior to passing any column to the
+ merge func.
+ overwrite : bool, default True
+ If True, columns in `self` that do not exist in `other` will be
+ overwritten with NaNs.
+
+ Returns
+ -------
+ DataFrame
+ Combination of the provided DataFrames.
+
+ See Also
+ --------
+ DataFrame.combine_first : Combine two DataFrame objects and default to
+ non-null values in frame calling the method.
+
+ Examples
+ --------
+ Combine using a simple function that chooses the smaller column.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
+ >>> df1.combine(df2, take_smaller)
+ A B
+ 0 0 3
+ 1 0 3
+
+ Example using a true element-wise combine function.
+
+ >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> df1.combine(df2, np.minimum)
+ A B
+ 0 1 2
+ 1 0 3
+
+ Using `fill_value` fills Nones prior to passing the column to the
+ merge function.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> df1.combine(df2, take_smaller, fill_value=-5)
+ A B
+ 0 0 -5.0
+ 1 0 4.0
+
+ However, if the same element in both dataframes is None, that None
+ is preserved.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
+ >>> df1.combine(df2, take_smaller, fill_value=-5)
+ A B
+ 0 0 -5.0
+ 1 0 3.0
+
+ Example that demonstrates the use of `overwrite` and behavior when
+ the axes differ between the dataframes.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
+ >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
+ >>> df1.combine(df2, take_smaller)
+ A B C
+ 0 NaN NaN NaN
+ 1 NaN 3.0 -10.0
+ 2 NaN 3.0 1.0
+
+ >>> df1.combine(df2, take_smaller, overwrite=False)
+ A B C
+ 0 0.0 NaN NaN
+ 1 0.0 3.0 -10.0
+ 2 NaN 3.0 1.0
+
+ Demonstrating the preference of the passed-in dataframe.
+ + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) + >>> df2.combine(df1, take_smaller) + A B C + 0 0.0 NaN NaN + 1 0.0 3.0 NaN + 2 NaN 3.0 NaN + + >>> df2.combine(df1, take_smaller, overwrite=False) + A B C + 0 0.0 NaN NaN + 1 0.0 3.0 1.0 + 2 NaN 3.0 1.0 + """ + other_idxlen = len(other.index) # save for compare + + this, other = self.align(other, copy=False) + new_index = this.index + + if other.empty and len(new_index) == len(self.index): + return self.copy() + + if self.empty and len(other) == other_idxlen: + return other.copy() + + # sorts if possible + new_columns = this.columns.union(other.columns) + do_fill = fill_value is not None + result = {} + for col in new_columns: + series = this[col] + otherSeries = other[col] + + this_dtype = series.dtype + other_dtype = otherSeries.dtype + + this_mask = isna(series) + other_mask = isna(otherSeries) + + # don't overwrite columns unnecessarily + # DO propagate if this column is not in the intersection + if not overwrite and other_mask.all(): + result[col] = this[col].copy() + continue + + if do_fill: + series = series.copy() + otherSeries = otherSeries.copy() + series[this_mask] = fill_value + otherSeries[other_mask] = fill_value + + if col not in self.columns: + # If self DataFrame does not have col in other DataFrame, + # try to promote series, which is all NaN, as other_dtype. + new_dtype = other_dtype + try: + series = series.astype(new_dtype, copy=False) + except ValueError: + # e.g. new_dtype is integer types + pass + else: + # if we have different dtypes, possibly promote + new_dtype = find_common_type([this_dtype, other_dtype]) + series = series.astype(new_dtype, copy=False) + otherSeries = otherSeries.astype(new_dtype, copy=False) + + arr = func(series, otherSeries) + if isinstance(new_dtype, np.dtype): + # if new_dtype is an EA Dtype, then `func` is expected to return + # the correct dtype without any additional casting + arr = maybe_downcast_to_dtype(arr, new_dtype) + + result[col] = arr + + # convert_objects just in case + return self._constructor(result, index=new_index, columns=new_columns) + + def combine_first(self, other: DataFrame) -> DataFrame: + """ + Update null elements with value in the same location in `other`. + + Combine two DataFrame objects by filling null values in one DataFrame + with non-null values from other DataFrame. The row and column indexes + of the resulting DataFrame will be the union of the two. The resulting + dataframe contains the 'first' dataframe values and overrides the + second one values where both first.loc[index, col] and + second.loc[index, col] are not missing values, upon calling + first.combine_first(second). + + Parameters + ---------- + other : DataFrame + Provided DataFrame to use to fill null values. + + Returns + ------- + DataFrame + The result of combining the provided DataFrame with the other object. + + See Also + -------- + DataFrame.combine : Perform series-wise operation on two DataFrames + using a given function. 
+ + Examples + -------- + >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine_first(df2) + A B + 0 1.0 3.0 + 1 0.0 4.0 + + Null values still persist if the location of that null value + does not exist in `other` + + >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) + >>> df1.combine_first(df2) + A B C + 0 NaN 4.0 NaN + 1 0.0 3.0 1.0 + 2 NaN 3.0 1.0 + """ + import pandas.core.computation.expressions as expressions + + def combiner(x, y): + mask = extract_array(isna(x)) + + x_values = extract_array(x, extract_numpy=True) + y_values = extract_array(y, extract_numpy=True) + + # If the column y in other DataFrame is not in first DataFrame, + # just return y_values. + if y.name not in self.columns: + return y_values + + return expressions.where(mask, y_values, x_values) + + combined = self.combine(other, combiner, overwrite=False) + + dtypes = { + col: find_common_type([self.dtypes[col], other.dtypes[col]]) + for col in self.columns.intersection(other.columns) + if not is_dtype_equal(combined.dtypes[col], self.dtypes[col]) + } + + if dtypes: + combined = combined.astype(dtypes) + + return combined + + @validate_bool_kwargs_from_keywords('overwrite') + def update( + self, + other, + join: str = "left", + overwrite: bool = True, + filter_func=None, + errors: str = "ignore", + ) -> None: + """ + Modify in place using non-NA values from another DataFrame. + + Aligns on indices. There is no return value. + + Parameters + ---------- + other : DataFrame, or object coercible into a DataFrame + Should have at least one matching index/column label + with the original DataFrame. If a Series is passed, + its name attribute must be set, and that will be + used as the column name to align with the original DataFrame. + join : {'left'}, default 'left' + Only left join is implemented, keeping the index and columns of the + original object. + overwrite : bool, default True + How to handle non-NA values for overlapping keys: + + * True: overwrite original DataFrame's values + with values from `other`. + * False: only update values that are NA in + the original DataFrame. + + filter_func : callable(1d-array) -> bool 1d-array, optional + Can choose to replace values other than NA. Return True for values + that should be updated. + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and `other` + both contain non-NA data in the same place. + + Returns + ------- + None : method directly changes calling object + + Raises + ------ + ValueError + * When `errors='raise'` and there's overlapping non-NA data. + * When `errors` is not either `'ignore'` or `'raise'` + NotImplementedError + * If `join != 'left'` + + See Also + -------- + dict.update : Similar method for dictionaries. + DataFrame.merge : For column(s)-on-column(s) operations. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = pd.DataFrame({'B': [4, 5, 6], + ... 'C': [7, 8, 9]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + The DataFrame's length does not increase as a result of the update, + only values at matching index/column labels are updated. + + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 
'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) + >>> df.update(new_df) + >>> df + A B + 0 a d + 1 b e + 2 c f + + For Series, its name attribute must be set. + + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) + >>> df.update(new_column) + >>> df + A B + 0 a d + 1 b y + 2 c e + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) + >>> df.update(new_df) + >>> df + A B + 0 a x + 1 b d + 2 c e + + If `other` contains NaNs the corresponding values are not updated + in the original dataframe. + + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4.0 + 1 2 500.0 + 2 3 6.0 + """ + import pandas.core.computation.expressions as expressions + + # TODO: Support other joins + if join != "left": # pragma: no cover + raise NotImplementedError("Only left join is supported") + if errors not in ["ignore", "raise"]: + raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + + if not isinstance(other, DataFrame): + other = DataFrame(other) + + other = other.reindex_like(self) + + for col in self.columns: + this = self[col]._values + that = other[col]._values + if filter_func is not None: + with np.errstate(all="ignore"): + mask = ~filter_func(this) | isna(that) + else: + if errors == "raise": + mask_this = notna(that) + mask_that = notna(this) + if any(mask_this & mask_that): + raise ValueError("Data overlaps.") + + if overwrite: + mask = isna(that) + else: + mask = notna(this) + + # don't overwrite columns unnecessarily + if mask.all(): + continue + + self[col] = expressions.where(mask, this, that) + + # ---------------------------------------------------------------------- + # Data reshaping + @Appender( + """ +Examples +-------- +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 'Max Speed': [380., 370., 24., 26.]}) +>>> df + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 +>>> df.groupby(['Animal']).mean() + Max Speed +Animal +Falcon 375.0 +Parrot 25.0 + +**Hierarchical Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, +... index=index) +>>> df + Max Speed +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +>>> df.groupby(level=0).mean() + Max Speed +Animal +Falcon 370.0 +Parrot 25.0 +>>> df.groupby(level="Type").mean() + Max Speed +Type +Captive 210.0 +Wild 185.0 + +We can also choose to include NA in group keys or not by setting +`dropna` parameter, the default setting is `True`. 
+ +>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] +>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + +>>> df.groupby(by=["b"]).sum() + a c +b +1.0 2 3 +2.0 2 5 + +>>> df.groupby(by=["b"], dropna=False).sum() + a c +b +1.0 2 3 +2.0 2 5 +NaN 1 4 + +>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] +>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + +>>> df.groupby(by="a").sum() + b c +a +a 13.0 13.0 +b 12.3 123.0 + +>>> df.groupby(by="a", dropna=False).sum() + b c +a +a 13.0 13.0 +b 12.3 123.0 +NaN 12.3 33.0 + +When using ``.apply()``, use ``group_keys`` to include or exclude the group keys. +The ``group_keys`` argument defaults to ``True`` (include). + +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 'Max Speed': [380., 370., 24., 26.]}) +>>> df.groupby("Animal", group_keys=True).apply(lambda x: x) + Animal Max Speed +Animal +Falcon 0 Falcon 380.0 + 1 Falcon 370.0 +Parrot 2 Parrot 24.0 + 3 Parrot 26.0 + +>>> df.groupby("Animal", group_keys=False).apply(lambda x: x) + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 +""" + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + @validate_bool_kwargs_from_keywords('as_index', 'sort', 'observed', 'dropna') + def groupby( + self, + by=None, + axis: Axis = 0, + level: Level | None = None, + as_index: bool = True, + sort: bool = True, + group_keys: bool | lib.NoDefault = no_default, + squeeze: bool | lib.NoDefault = no_default, + observed: bool = False, + dropna: bool = True, + ) -> DataFrameGroupBy: + from pandas.core.groupby.generic import DataFrameGroupBy + + if squeeze is not no_default: + warnings.warn( + ( + "The `squeeze` parameter is deprecated and " + "will be removed in a future version." + ), + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + squeeze = False + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + # https://github.com/python/mypy/issues/7642 + # error: Argument "squeeze" to "DataFrameGroupBy" has incompatible type + # "Union[bool, NoDefault]"; expected "bool" + return DataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, # type: ignore[arg-type] + observed=observed, + dropna=dropna, + ) + + _shared_docs[ + "pivot" + ] = """ + Return reshaped DataFrame organized by given index / column values. + + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. This function does not support data + aggregation, multiple values will result in a MultiIndex in the + columns. See the :ref:`User Guide ` for more on reshaping. + + Parameters + ----------%s + index : str or object or a list of str, optional + Column to use to make new frame's index. If None, uses + existing index. + + .. versionchanged:: 1.1.0 + Also accept list of index names. + + columns : str or object or a list of str + Column to use to make new frame's columns. + + .. versionchanged:: 1.1.0 + Also accept list of columns names. + + values : str, object or a list of the previous, optional + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + Returns + ------- + DataFrame + Returns reshaped DataFrame. 
+ + Raises + ------ + ValueError: + When there are any `index`, `columns` combinations with multiple + values. `DataFrame.pivot_table` when you need to aggregate. + + See Also + -------- + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', + ... 'two'], + ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], + ... 'baz': [1, 2, 3, 4, 5, 6], + ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + 5 two C 6 t + + >>> df.pivot(index='foo', columns='bar', values='baz') + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index='foo', columns='bar')['baz'] + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) + baz zoo + bar A B C A B C + foo + one 1 2 3 x y z + two 4 5 6 q w t + + You could also assign a list of column names or a list of index names. + + >>> df = pd.DataFrame({ + ... "lev1": [1, 1, 1, 2, 2, 2], + ... "lev2": [1, 1, 2, 1, 1, 2], + ... "lev3": [1, 2, 1, 2, 1, 2], + ... "lev4": [1, 2, 3, 4, 5, 6], + ... "values": [0, 1, 2, 3, 4, 5]}) + >>> df + lev1 lev2 lev3 lev4 values + 0 1 1 1 1 0 + 1 1 1 2 2 1 + 2 1 2 1 3 2 + 3 2 1 2 4 3 + 4 2 1 1 5 4 + 5 2 2 2 6 5 + + >>> df.pivot(index="lev1", columns=["lev2", "lev3"],values="values") + lev2 1 2 + lev3 1 2 1 2 + lev1 + 1 0.0 1.0 2.0 NaN + 2 4.0 3.0 NaN 5.0 + + >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values") + lev3 1 2 + lev1 lev2 + 1 1 0.0 1.0 + 2 2.0 NaN + 2 1 4.0 3.0 + 2 NaN 5.0 + + A ValueError is raised if there are any duplicates. + + >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], + ... "bar": ['A', 'A', 'B', 'C'], + ... "baz": [1, 2, 3, 4]}) + >>> df + foo bar baz + 0 one A 1 + 1 one A 2 + 2 two B 3 + 3 two C 4 + + Notice that the first two rows are the same for our `index` + and `columns` arguments. + + >>> df.pivot(index='foo', columns='bar', values='baz') + Traceback (most recent call last): + ... + ValueError: Index contains duplicate entries, cannot reshape + """ + + @Substitution("") + @Appender(_shared_docs["pivot"]) + def pivot(self, index=None, columns=None, values=None) -> DataFrame: + from pandas.core.reshape.pivot import pivot + + return pivot(self, index=index, columns=columns, values=values) + + _shared_docs[ + "pivot_table" + ] = """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects + (hierarchical indexes) on the index and columns of the result DataFrame. + + Parameters + ----------%s + values : column to aggregate, optional + index : column, Grouper, array, or list of the previous + If an array is passed, it must be the same length as the data. The + list can contain any of the other types (except list). + Keys to group by on the pivot table index. If an array is passed, + it is being used as the same manner as column values. + columns : column, Grouper, array, or list of the previous + If an array is passed, it must be the same length as the data. 
The + list can contain any of the other types (except list). + Keys to group by on the pivot table column. If an array is passed, + it is being used as the same manner as column values. + aggfunc : function, list of functions, dict, default numpy.mean + If list of functions passed, the resulting pivot table will have + hierarchical columns whose top level are the function names + (inferred from the function objects themselves) + If dict is passed, the key is column to aggregate and value + is function or list of functions. + fill_value : scalar, default None + Value to replace missing values with (in the resulting pivot table, + after aggregation). + margins : bool, default False + Add all row / columns (e.g. for subtotal / grand totals). + dropna : bool, default True + Do not include columns whose entries are all NaN. + margins_name : str, default 'All' + Name of the row / column that will contain the totals + when margins is True. + observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionchanged:: 0.25.0 + + sort : bool, default True + Specifies if the result should be sorted. + + .. versionadded:: 1.3.0 + + Returns + ------- + DataFrame + An Excel style pivot table. + + See Also + -------- + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.melt: Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", + ... "bar", "bar", "bar", "bar"], + ... "B": ["one", "one", "one", "two", "two", + ... "one", "one", "two", "two"], + ... "C": ["small", "large", "large", "small", + ... "small", "large", "small", "small", + ... "large"], + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) + >>> df + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 + + This first example aggregates values by taking the sum. + + >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], + ... columns=['C'], aggfunc=np.sum) + >>> table + C large small + A B + bar one 4.0 5.0 + two 7.0 6.0 + foo one 4.0 1.0 + two NaN 6.0 + + We can also fill missing values using the `fill_value` parameter. + + >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], + ... columns=['C'], aggfunc=np.sum, fill_value=0) + >>> table + C large small + A B + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': np.mean, + ... 'E': np.mean}) + >>> table + D E + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. + + >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': np.mean, + ... 
'E': [min, max, np.mean]}) + >>> table + D E + mean max mean min + A C + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 + """ + + @Substitution("") + @Appender(_shared_docs["pivot_table"]) + def pivot_table( + self, + values=None, + index=None, + columns=None, + aggfunc="mean", + fill_value=None, + margins=False, + dropna=True, + margins_name="All", + observed=False, + sort=True, + ) -> DataFrame: + from pandas.core.reshape.pivot import pivot_table + + return pivot_table( + self, + values=values, + index=index, + columns=columns, + aggfunc=aggfunc, + fill_value=fill_value, + margins=margins, + dropna=dropna, + margins_name=margins_name, + observed=observed, + sort=sort, + ) + + @validate_bool_kwargs_from_keywords('dropna') + def stack(self, level: Level = -1, dropna: bool = True): + """ + Stack the prescribed level(s) from columns to index. + + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to the current + DataFrame. The new inner-most levels are created by pivoting the + columns of the current dataframe: + + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. + + Parameters + ---------- + level : int, str, list, default -1 + Level(s) to stack from the column axis onto the index + axis, defined as one index or label, or a list of indices + or labels. + dropna : bool, default True + Whether to drop rows in the resulting Frame/Series with + missing values. Stacking a column level onto the index + axis can create combinations of index and column values + that are missing from the original dataframe. See Examples + section. + + Returns + ------- + DataFrame or Series + Stacked dataframe or series. + + See Also + -------- + DataFrame.unstack : Unstack prescribed level(s) from index axis + onto column axis. + DataFrame.pivot : Reshape dataframe from long format to wide + format. + DataFrame.pivot_table : Create a spreadsheet-style pivot table + as a DataFrame. + + Notes + ----- + The function is named by analogy with a collection of books + being reorganized from being side by side on a horizontal + position (the columns of the dataframe) to being stacked + vertically on top of each other (in the index of the + dataframe). + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + **Single level columns** + + >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], + ... index=['cat', 'dog'], + ... columns=['weight', 'height']) + + Stacking a dataframe with a single level column axis returns a Series: + + >>> df_single_level_cols + weight height + cat 0 1 + dog 2 3 + >>> df_single_level_cols.stack() + cat weight 0 + height 1 + dog weight 2 + height 3 + dtype: int64 + + **Multi level columns: simple case** + + >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('weight', 'pounds')]) + >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], + ... index=['cat', 'dog'], + ... columns=multicol1) + + Stacking a dataframe with a multi-level column axis: + + >>> df_multi_level_cols1 + weight + kg pounds + cat 1 2 + dog 2 4 + >>> df_multi_level_cols1.stack() + weight + cat kg 1 + pounds 2 + dog kg 2 + pounds 4 + + **Missing values** + + >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... 
('height', 'm')]) + >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + It is common to have missing values when stacking a dataframe + with multi-level columns, as the stacked dataframe typically + has more values than the original dataframe. Missing values + are filled with NaNs: + + >>> df_multi_level_cols2 + weight height + kg m + cat 1.0 2.0 + dog 3.0 4.0 + >>> df_multi_level_cols2.stack() + height weight + cat kg NaN 1.0 + m 2.0 NaN + dog kg NaN 3.0 + m 4.0 NaN + + **Prescribing the level(s) to be stacked** + + The first parameter controls which level or levels are stacked: + + >>> df_multi_level_cols2.stack(0) + kg m + cat height NaN 2.0 + weight 1.0 NaN + dog height NaN 4.0 + weight 3.0 NaN + >>> df_multi_level_cols2.stack([0, 1]) + cat height m 2.0 + weight kg 1.0 + dog height m 4.0 + weight kg 3.0 + dtype: float64 + + **Dropping missing values** + + >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + Note that rows where all values are missing are dropped by + default but this behaviour can be controlled via the dropna + keyword parameter: + + >>> df_multi_level_cols3 + weight height + kg m + cat NaN 1.0 + dog 2.0 3.0 + >>> df_multi_level_cols3.stack(dropna=False) + height weight + cat kg NaN NaN + m 1.0 NaN + dog kg NaN 2.0 + m 3.0 NaN + >>> df_multi_level_cols3.stack(dropna=True) + height weight + cat m 1.0 NaN + dog kg NaN 2.0 + m 3.0 NaN + """ + from pandas.core.reshape.reshape import ( + stack, + stack_multiple, + ) + + if isinstance(level, (tuple, list)): + result = stack_multiple(self, level, dropna=dropna) + else: + result = stack(self, level, dropna=dropna) + + return result.__finalize__(self, method="stack") + + @validate_bool_kwargs_from_keywords('ignore_index') + def explode( + self, + column: IndexLabel, + ignore_index: bool = False, + ) -> DataFrame: + """ + Transform each element of a list-like to a row, replicating index values. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + column : IndexLabel + Column(s) to explode. + For multiple columns, specify a non-empty list with each element + be str or tuple, and all specified columns their list-like data + on same row of the frame must have matching length. + + .. versionadded:: 1.3.0 + Multi-column explode + + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.1.0 + + Returns + ------- + DataFrame + Exploded lists to rows of the subset columns; + index will be duplicated for these rows. + + Raises + ------ + ValueError : + * If columns of the frame are not unique. + * If specified columns to explode is empty list. + * If specified columns to explode have not matching count of + elements rowwise in the frame. + + See Also + -------- + DataFrame.unstack : Pivot a level of the (necessarily hierarchical) + index labels. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + Series.explode : Explode a DataFrame from list-like columns to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, sets, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of rows in the + output will be non-deterministic when exploding sets. + + Reference :ref:`the user guide ` for more examples. 
+ + Examples + -------- + >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], + ... 'B': 1, + ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) + >>> df + A B C + 0 [0, 1, 2] 1 [a, b, c] + 1 foo 1 NaN + 2 [] 1 [] + 3 [3, 4] 1 [d, e] + + Single-column explode. + + >>> df.explode('A') + A B C + 0 0 1 [a, b, c] + 0 1 1 [a, b, c] + 0 2 1 [a, b, c] + 1 foo 1 NaN + 2 NaN 1 [] + 3 3 1 [d, e] + 3 4 1 [d, e] + + Multi-column explode. + + >>> df.explode(list('AC')) + A B C + 0 0 1 a + 0 1 1 b + 0 2 1 c + 1 foo 1 NaN + 2 NaN 1 NaN + 3 3 1 d + 3 4 1 e + """ + if not self.columns.is_unique: + raise ValueError("columns must be unique") + + columns: list[Hashable] + if is_scalar(column) or isinstance(column, tuple): + columns = [column] + elif isinstance(column, list) and all( + map(lambda c: is_scalar(c) or isinstance(c, tuple), column) + ): + if not column: + raise ValueError("column must be nonempty") + if len(column) > len(set(column)): + raise ValueError("column must be unique") + columns = column + else: + raise ValueError("column must be a scalar, tuple, or list thereof") + + df = self.reset_index(drop=True) + if len(columns) == 1: + result = df[columns[0]].explode() + else: + mylen = lambda x: len(x) if is_list_like(x) else -1 + counts0 = self[columns[0]].apply(mylen) + for c in columns[1:]: + if not all(counts0 == self[c].apply(mylen)): + raise ValueError("columns must have matching element counts") + result = DataFrame({c: df[c].explode() for c in columns}) + result = df.drop(columns, axis=1).join(result) + if ignore_index: + result.index = default_index(len(result)) + else: + result.index = self.index.take(result.index) + result = result.reindex(columns=self.columns, copy=False) + + return result.__finalize__(self, method="explode") + + def unstack(self, level: Level = -1, fill_value=None): + """ + Pivot a level of the (necessarily hierarchical) index labels. + + Returns a DataFrame having a new level of column labels whose inner-most level + consists of the pivoted index labels. + + If the index is not a MultiIndex, the output will be a Series + (the analogue of stack when the columns are not a MultiIndex). + + Parameters + ---------- + level : int, str, or list of these, default -1 (last level) + Level(s) of index to unstack, can pass level name. + fill_value : int, str or dict + Replace NaN with this value if the unstack produces missing values. + + Returns + ------- + Series or DataFrame + + See Also + -------- + DataFrame.pivot : Pivot a table based on column values. + DataFrame.stack : Pivot a level of the column labels (inverse operation + from `unstack`). + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), + ... 
('two', 'a'), ('two', 'b')]) + >>> s = pd.Series(np.arange(1.0, 5.0), index=index) + >>> s + one a 1.0 + b 2.0 + two a 3.0 + b 4.0 + dtype: float64 + + >>> s.unstack(level=-1) + a b + one 1.0 2.0 + two 3.0 4.0 + + >>> s.unstack(level=0) + one two + a 1.0 3.0 + b 2.0 4.0 + + >>> df = s.unstack(level=0) + >>> df.unstack() + one a 1.0 + b 2.0 + two a 3.0 + b 4.0 + dtype: float64 + """ + from pandas.core.reshape.reshape import unstack + + result = unstack(self, level, fill_value) + + return result.__finalize__(self, method="unstack") + + @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level: Level | None = None, + ignore_index: bool = True, + ) -> DataFrame: + + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ignore_index=ignore_index, + ).__finalize__(self, method="melt") + + # ---------------------------------------------------------------------- + # Time series-related + + @doc( + Series.diff, + klass="DataFrame", + extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n " + "Take difference over rows (0) or columns (1).\n", + other_klass="Series", + examples=dedent( + """ + Difference with previous row + + >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], + ... 'b': [1, 1, 2, 3, 5, 8], + ... 'c': [1, 4, 9, 16, 25, 36]}) + >>> df + a b c + 0 1 1 1 + 1 2 1 4 + 2 3 2 9 + 3 4 3 16 + 4 5 5 25 + 5 6 8 36 + + >>> df.diff() + a b c + 0 NaN NaN NaN + 1 1.0 0.0 3.0 + 2 1.0 1.0 5.0 + 3 1.0 1.0 7.0 + 4 1.0 2.0 9.0 + 5 1.0 3.0 11.0 + + Difference with previous column + + >>> df.diff(axis=1) + a b c + 0 NaN 0 0 + 1 NaN -1 3 + 2 NaN -1 7 + 3 NaN -1 13 + 4 NaN 0 20 + 5 NaN 2 28 + + Difference with 3rd previous row + + >>> df.diff(periods=3) + a b c + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 3.0 2.0 15.0 + 4 3.0 4.0 21.0 + 5 3.0 6.0 27.0 + + Difference with following row + + >>> df.diff(periods=-1) + a b c + 0 -1.0 0.0 -3.0 + 1 -1.0 -1.0 -5.0 + 2 -1.0 -1.0 -7.0 + 3 -1.0 -2.0 -9.0 + 4 -1.0 -3.0 -11.0 + 5 NaN NaN NaN + + Overflow in input dtype + + >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8) + >>> df.diff() + a + 0 NaN + 1 255.0""" + ), + ) + def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: + if not lib.is_integer(periods): + if not ( + is_float(periods) + # error: "int" has no attribute "is_integer" + and periods.is_integer() # type: ignore[attr-defined] + ): + raise ValueError("periods must be an integer") + periods = int(periods) + + axis = self._get_axis_number(axis) + if axis == 1 and periods != 0: + return self - self.shift(periods, axis=axis) + + new_data = self._mgr.diff(n=periods, axis=axis) + return self._constructor(new_data).__finalize__(self, "diff") + + # ---------------------------------------------------------------------- + # Function application + + def _gotitem( + self, + key: IndexLabel, + ndim: int, + subset: DataFrame | Series | None = None, + ) -> DataFrame | Series: + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : string / list of selections + ndim : {1, 2} + requested ndim of result + subset : object, default None + subset to act on + """ + if subset is None: + subset = self + elif subset.ndim == 1: # is Series + return subset + + # TODO: _shallow_copy(subset)? 
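+        # Note on the validate_bool_kwargs_from_keywords decorator applied
+        # to many methods in this module: a minimal sketch of its assumed
+        # behavior (the real implementation lives in
+        # pandas.util._validators; this illustration simply runs
+        # validate_bool_kwarg over each named argument and is not the
+        # actual implementation):
+        #
+        #     import functools
+        #     import inspect
+        #
+        #     from pandas.util._validators import validate_bool_kwarg
+        #
+        #     def validate_bool_kwargs_from_keywords(*names):
+        #         def decorator(func):
+        #             @functools.wraps(func)
+        #             def wrapper(*args, **kwargs):
+        #                 bound = inspect.signature(func).bind(*args, **kwargs)
+        #                 bound.apply_defaults()
+        #                 for name in names:
+        #                     validate_bool_kwarg(bound.arguments[name], name)
+        #                 return func(*args, **kwargs)
+        #             return wrapper
+        #         return decorator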
+ return subset[key] + + _agg_summary_and_see_also_doc = dedent( + """ + The aggregation operations are always performed over an axis, either the + index (default) or the column axis. This behavior is different from + `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, + `var`), where the default is to compute the aggregation of the flattened + array, e.g., ``numpy.mean(arr_2d)`` as opposed to + ``numpy.mean(arr_2d, axis=0)``. + + `agg` is an alias for `aggregate`. Use the alias. + + See Also + -------- + DataFrame.apply : Perform any type of operations. + DataFrame.transform : Perform transformation type operations. + core.groupby.GroupBy : Perform operations over groups. + core.resample.Resampler : Perform operations over resampled bins. + core.window.Rolling : Perform operations over rolling window. + core.window.Expanding : Perform operations over expanding window. + core.window.ExponentialMovingWindow : Perform operation over exponential weighted + window. + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> df = pd.DataFrame([[1, 2, 3], + ... [4, 5, 6], + ... [7, 8, 9], + ... [np.nan, np.nan, np.nan]], + ... columns=['A', 'B', 'C']) + + Aggregate these functions over the rows. + + >>> df.agg(['sum', 'min']) + A B C + sum 12.0 15.0 18.0 + min 1.0 2.0 3.0 + + Different aggregations per column. + + >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) + A B + sum 12.0 NaN + min 1.0 2.0 + max NaN 8.0 + + Aggregate different functions over the columns and rename the index of the resulting + DataFrame. + + >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) + A B C + x 7.0 NaN NaN + y NaN 2.0 NaN + z NaN NaN 6.0 + + Aggregate over the columns. + + >>> df.agg("mean", axis="columns") + 0 2.0 + 1 5.0 + 2 8.0 + 3 NaN + dtype: float64 + """ + ) + + @doc( + _shared_docs["aggregate"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + see_also=_agg_summary_and_see_also_doc, + examples=_agg_examples_doc, + ) + def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): + from pandas.core.apply import frame_apply + + axis = self._get_axis_number(axis) + + relabeling, func, columns, order = reconstruct_func(func, **kwargs) + + op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) + result = op.agg() + + if relabeling: + # This is to keep the order to columns occurrence unchanged, and also + # keep the order of new columns occurrence unchanged + + # For the return values of reconstruct_func, if relabeling is + # False, columns and order will be None. + assert columns is not None + assert order is not None + + result_in_dict = relabel_result(result, func, columns, order) + result = DataFrame(result_in_dict, index=columns) + + return result + + agg = aggregate + + @doc( + _shared_docs["transform"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + ) + def transform( + self, func: AggFuncType, axis: Axis = 0, *args, **kwargs + ) -> DataFrame: + from pandas.core.apply import frame_apply + + op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) + result = op.transform() + assert isinstance(result, DataFrame) + return result + + @validate_bool_kwargs_from_keywords('raw') + def apply( + self, + func: AggFuncType, + axis: Axis = 0, + raw: bool = False, + result_type=None, + args=(), + **kwargs, + ): + """ + Apply a function along an axis of the DataFrame. 
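+
+        For example, ``raw=True`` (validated as a boolean by the decorator
+        above) passes each column to ``func`` as a plain ``numpy.ndarray``
+        rather than a Series, which can be much faster for NumPy reductions:
+
+        >>> pd.DataFrame({'A': [1, 2], 'B': [3, 4]}).apply(np.sum, raw=True)
+        A    3
+        B    7
+        dtype: int64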
+ + Objects passed to the function are Series objects whose index is + either the DataFrame's index (``axis=0``) or the DataFrame's columns + (``axis=1``). By default (``result_type=None``), the final return type + is inferred from the return type of the applied function. Otherwise, + it depends on the `result_type` argument. + + Parameters + ---------- + func : function + Function to apply to each column or row. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis along which the function is applied: + + * 0 or 'index': apply function to each column. + * 1 or 'columns': apply function to each row. + + raw : bool, default False + Determines if row or column is passed as a Series or ndarray object: + + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` : the passed function will receive ndarray objects + instead. + If you are just applying a NumPy reduction function this will + achieve much better performance. + + result_type : {'expand', 'reduce', 'broadcast', None}, default None + These only act when ``axis=1`` (columns): + + * 'expand' : list-like results will be turned into columns. + * 'reduce' : returns a Series if possible rather than expanding + list-like results. This is the opposite of 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the DataFrame, the original index and columns will be + retained. + + The default behaviour (None) depends on the return value of the + applied function: list-like results will be returned as a Series + of those. However if the apply function returns a Series these + are expanded to columns. + args : tuple + Positional arguments to pass to `func` in addition to the + array/series. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. + + Returns + ------- + Series or DataFrame + Result of applying ``func`` along the given axis of the + DataFrame. + + See Also + -------- + DataFrame.applymap: For elementwise operations. + DataFrame.aggregate: Only perform aggregating type operations. + DataFrame.transform: Only perform transforming type operations. + + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) + >>> df + A B + 0 4 9 + 1 4 9 + 2 4 9 + + Using a numpy universal function (in this case the same as + ``np.sqrt(df)``): + + >>> df.apply(np.sqrt) + A B + 0 2.0 3.0 + 1 2.0 3.0 + 2 2.0 3.0 + + Using a reducing function on either axis + + >>> df.apply(np.sum, axis=0) + A 12 + B 27 + dtype: int64 + + >>> df.apply(np.sum, axis=1) + 0 13 + 1 13 + 2 13 + dtype: int64 + + Returning a list-like will result in a Series + + >>> df.apply(lambda x: [1, 2], axis=1) + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + dtype: object + + Passing ``result_type='expand'`` will expand list-like results + to columns of a Dataframe + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') + 0 1 + 0 1 2 + 1 1 2 + 2 1 2 + + Returning a Series inside the function is similar to passing + ``result_type='expand'``. The resulting column names + will be the Series index. + + >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) + foo bar + 0 1 2 + 1 1 2 + 2 1 2 + + Passing ``result_type='broadcast'`` will ensure the same shape + result, whether list-like or scalar is returned by the function, + and broadcast it along the axis. 
The resulting column names will + be the originals. + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') + A B + 0 1 2 + 1 1 2 + 2 1 2 + """ + from pandas.core.apply import frame_apply + + op = frame_apply( + self, + func=func, + axis=axis, + raw=raw, + result_type=result_type, + args=args, + kwargs=kwargs, + ) + return op.apply().__finalize__(self, method="apply") + + def applymap( + self, func: PythonFuncType, na_action: str | None = None, **kwargs + ) -> DataFrame: + """ + Apply a function to a Dataframe elementwise. + + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. + + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If ‘ignore’, propagate NaN values, without passing them to func. + + .. versionadded:: 1.2 + + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. + + .. versionadded:: 1.3.0 + + Returns + ------- + DataFrame + Transformed DataFrame. + + See Also + -------- + DataFrame.apply : Apply a function along input axis of DataFrame. + + Examples + -------- + >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) + >>> df + 0 1 + 0 1.000 2.120 + 1 3.356 4.567 + + >>> df.applymap(lambda x: len(str(x))) + 0 1 + 0 3 4 + 1 5 5 + + Like Series.map, NA values can be ignored: + + >>> df_copy = df.copy() + >>> df_copy.iloc[0, 0] = pd.NA + >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore') + 0 1 + 0 NaN 4 + 1 5.0 5 + + Note that a vectorized version of `func` often exists, which will + be much faster. You could square each number elementwise. + + >>> df.applymap(lambda x: x**2) + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 + + But it's better to avoid applymap in that case. + + >>> df ** 2 + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 + """ + if na_action not in {"ignore", None}: + raise ValueError( + f"na_action must be 'ignore' or None. Got {repr(na_action)}" + ) + ignore_na = na_action == "ignore" + func = functools.partial(func, **kwargs) + + # if we have a dtype == 'M8[ns]', provide boxed values + def infer(x): + if x.empty: + return lib.map_infer(x, func, ignore_na=ignore_na) + return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) + + return self.apply(infer).__finalize__(self, "applymap") + + # ---------------------------------------------------------------------- + # Merging / joining methods + + @validate_bool_kwargs_from_keywords('ignore_index', 'verify_integrity', 'sort') + def append( + self, + other, + ignore_index: bool = False, + verify_integrity: bool = False, + sort: bool = False, + ) -> DataFrame: + """ + Append rows of `other` to the end of caller, returning a new object. + + .. deprecated:: 1.4.0 + Use :func:`concat` instead. For further details see + :ref:`whatsnew_140.deprecations.frame_series_append` + + Columns in `other` that are not in the caller are added as new columns. + + Parameters + ---------- + other : DataFrame or Series/dict-like object, or list of these + The data to append. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + verify_integrity : bool, default False + If True, raise ValueError on creating index with duplicates. + sort : bool, default False + Sort columns if the columns of `self` and `other` are not aligned. + + .. versionchanged:: 1.0.0 + + Changed to not sort by default. 
+ + Returns + ------- + DataFrame + A new DataFrame consisting of the rows of caller and the rows of `other`. + + See Also + -------- + concat : General function to concatenate DataFrame or Series objects. + + Notes + ----- + If a list of dict/series is passed and the keys are all contained in + the DataFrame's index, the order of the columns in the resulting + DataFrame will be unchanged. + + Iteratively appending rows to a DataFrame can be more computationally + intensive than a single concatenate. A better solution is to append + those rows to a list and then concatenate the list with the original + DataFrame all at once. + + Examples + -------- + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'), index=['x', 'y']) + >>> df + A B + x 1 2 + y 3 4 + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'), index=['x', 'y']) + >>> df.append(df2) + A B + x 1 2 + y 3 4 + x 5 6 + y 7 8 + + With `ignore_index` set to True: + + >>> df.append(df2, ignore_index=True) + A B + 0 1 2 + 1 3 4 + 2 5 6 + 3 7 8 + + The following, while not recommended methods for generating DataFrames, + show two ways to generate a DataFrame from multiple data sources. + + Less efficient: + + >>> df = pd.DataFrame(columns=['A']) + >>> for i in range(5): + ... df = df.append({'A': i}, ignore_index=True) + >>> df + A + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 + + More efficient: + + >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)], + ... ignore_index=True) + A + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 + """ + warnings.warn( + "The frame.append method is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.concat instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return self._append(other, ignore_index, verify_integrity, sort) + + @validate_bool_kwargs_from_keywords('ignore_index', 'verify_integrity', 'sort') + def _append( + self, + other, + ignore_index: bool = False, + verify_integrity: bool = False, + sort: bool = False, + ) -> DataFrame: + combined_columns = None + if isinstance(other, (Series, dict)): + if isinstance(other, dict): + if not ignore_index: + raise TypeError("Can only append a dict if ignore_index=True") + other = Series(other) + if other.name is None and not ignore_index: + raise TypeError( + "Can only append a Series if ignore_index=True " + "or if the Series has a name" + ) + + index = Index([other.name], name=self.index.name) + idx_diff = other.index.difference(self.columns) + combined_columns = self.columns.append(idx_diff) + row_df = other.to_frame().T + # infer_objects is needed for + # test_append_empty_frame_to_series_with_dateutil_tz + other = row_df.infer_objects().rename_axis(index.names, copy=False) + elif isinstance(other, list): + if not other: + pass + elif not isinstance(other[0], DataFrame): + other = DataFrame(other) + if self.index.name is not None and not ignore_index: + other.index.name = self.index.name + + from pandas.core.reshape.concat import concat + + if isinstance(other, (list, tuple)): + to_concat = [self, *other] + else: + to_concat = [self, other] + + result = concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) + if ( + combined_columns is not None + and not sort + and not combined_columns.equals(result.columns) + ): + # TODO: reindexing here is a kludge bc union_indexes does not + # pass sort to index.union, xref #43375 + # combined_columns.equals check is necessary for preserving dtype + # in test_crosstab_normalize + result = result.reindex(combined_columns, 
axis=1) + return result.__finalize__(self, method="append") + + @validate_bool_kwargs_from_keywords('sort') + def join( + self, + other: DataFrame | Series, + on: IndexLabel | None = None, + how: str = "left", + lsuffix: str = "", + rsuffix: str = "", + sort: bool = False, + validate: str | None = None, + ) -> DataFrame: + """ + Join columns of another DataFrame. + + Join columns with `other` DataFrame either on index or on a key + column. Efficiently join multiple DataFrame objects by index at once by + passing a list. + + Parameters + ---------- + other : DataFrame, Series, or list of DataFrame + Index should be similar to one of the columns in this one. If a + Series is passed, its name attribute must be set, and that will be + used as the column name in the resulting joined DataFrame. + on : str, list of str, or array-like, optional + Column or index level name(s) in the caller to join on the index + in `other`, otherwise joins index-on-index. If multiple + values given, the `other` DataFrame must have a MultiIndex. Can + pass an array as the join key if it is not already contained in + the calling DataFrame. Like an Excel VLOOKUP operation. + how : {'left', 'right', 'outer', 'inner'}, default 'left' + How to handle the operation of the two objects. + + * left: use calling frame's index (or column if on is specified) + * right: use `other`'s index. + * outer: form union of calling frame's index (or column if on is + specified) with `other`'s index, and sort it. + lexicographically. + * inner: form intersection of calling frame's index (or column if + on is specified) with `other`'s index, preserving the order + of the calling's one. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + + .. versionadded:: 1.2.0 + + lsuffix : str, default '' + Suffix to use from left frame's overlapping columns. + rsuffix : str, default '' + Suffix to use from right frame's overlapping columns. + sort : bool, default False + Order result DataFrame lexicographically by the join key. If False, + the order of the join key depends on the join type (how keyword). + validate : str, optional + If specified, checks if join is of specified type. + * "one_to_one" or "1:1": check if join keys are unique in both left + and right datasets. + * "one_to_many" or "1:m": check if join keys are unique in left dataset. + * "many_to_one" or "m:1": check if join keys are unique in right dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + .. versionadded:: 1.5.0 + + Returns + ------- + DataFrame + A dataframe containing columns from both the caller and `other`. + + See Also + -------- + DataFrame.merge : For column(s)-on-column(s) operations. + + Notes + ----- + Parameters `on`, `lsuffix`, and `rsuffix` are not supported when + passing a list of `DataFrame` objects. + + Support for specifying index levels as the `on` parameter was added + in version 0.23.0. + + Examples + -------- + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df + key A + 0 K0 A0 + 1 K1 A1 + 2 K2 A2 + 3 K3 A3 + 4 K4 A4 + 5 K5 A5 + + >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], + ... 'B': ['B0', 'B1', 'B2']}) + + >>> other + key B + 0 K0 B0 + 1 K1 B1 + 2 K2 B2 + + Join DataFrames using their indexes. 
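+
+        With the default ``how='left'`` every row of the calling frame is
+        kept, and rows with no match in `other` are filled with NaN, as the
+        output below shows.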
+ + >>> df.join(other, lsuffix='_caller', rsuffix='_other') + key_caller A key_other B + 0 K0 A0 K0 B0 + 1 K1 A1 K1 B1 + 2 K2 A2 K2 B2 + 3 K3 A3 NaN NaN + 4 K4 A4 NaN NaN + 5 K5 A5 NaN NaN + + If we want to join using the key columns, we need to set key to be + the index in both `df` and `other`. The joined DataFrame will have + key as its index. + + >>> df.set_index('key').join(other.set_index('key')) + A B + key + K0 A0 B0 + K1 A1 B1 + K2 A2 B2 + K3 A3 NaN + K4 A4 NaN + K5 A5 NaN + + Another option to join using the key columns is to use the `on` + parameter. DataFrame.join always uses `other`'s index but we can use + any column in `df`. This method preserves the original DataFrame's + index in the result. + + >>> df.join(other.set_index('key'), on='key') + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K2 A2 B2 + 3 K3 A3 NaN + 4 K4 A4 NaN + 5 K5 A5 NaN + + Using non-unique key values shows how they are matched. + + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df + key A + 0 K0 A0 + 1 K1 A1 + 2 K1 A2 + 3 K3 A3 + 4 K0 A4 + 5 K1 A5 + + >>> df.join(other.set_index('key'), on='key', validate='m:1') + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K1 A2 B1 + 3 K3 A3 NaN + 4 K0 A4 B0 + 5 K1 A5 B1 + """ + return self._join_compat( + other, + on=on, + how=how, + lsuffix=lsuffix, + rsuffix=rsuffix, + sort=sort, + validate=validate, + ) + + @validate_bool_kwargs_from_keywords('sort') + def _join_compat( + self, + other: DataFrame | Series, + on: IndexLabel | None = None, + how: str = "left", + lsuffix: str = "", + rsuffix: str = "", + sort: bool = False, + validate: str | None = None, + ): + from pandas.core.reshape.concat import concat + from pandas.core.reshape.merge import merge + + if isinstance(other, Series): + if other.name is None: + raise ValueError("Other Series must have a name") + other = DataFrame({other.name: other}) + + if isinstance(other, DataFrame): + if how == "cross": + return merge( + self, + other, + how=how, + on=on, + suffixes=(lsuffix, rsuffix), + sort=sort, + validate=validate, + ) + return merge( + self, + other, + left_on=on, + how=how, + left_index=on is None, + right_index=True, + suffixes=(lsuffix, rsuffix), + sort=sort, + validate=validate, + ) + else: + if on is not None: + raise ValueError( + "Joining multiple DataFrames only supported for joining on index" + ) + + if rsuffix or lsuffix: + raise ValueError( + "Suffixes not supported when joining multiple DataFrames" + ) + + frames = [self] + list(other) + + can_concat = all(df.index.is_unique for df in frames) + + # join indexes only using concat + if can_concat: + if how == "left": + res = concat( + frames, axis=1, join="outer", verify_integrity=True, sort=sort + ) + return res.reindex(self.index, copy=False) + else: + return concat( + frames, axis=1, join=how, verify_integrity=True, sort=sort + ) + + joined = frames[0] + + for frame in frames[1:]: + joined = merge( + joined, + frame, + how=how, + left_index=True, + right_index=True, + validate=validate, + ) + + return joined + + @Substitution("") + @Appender(_merge_doc, indents=2) + @validate_bool_kwargs_from_keywords('left_index', 'right_index', 'copy', 'indicator', 'sort') + def merge( + self, + right: DataFrame | Series, + how: str = "inner", + on: IndexLabel | None = None, + left_on: IndexLabel | None = None, + right_on: IndexLabel | None = None, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, + suffixes: Suffixes = ("_x", "_y"), + copy: bool = True, + indicator: 
bool = False, + validate: str | None = None, + ) -> DataFrame: + from pandas.core.reshape.merge import merge + + return merge( + self, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + ) + + def round( + self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs + ) -> DataFrame: + """ + Round a DataFrame to a variable number of decimal places. + + Parameters + ---------- + decimals : int, dict, Series + Number of decimal places to round each column to. If an int is + given, round each column to the same number of places. + Otherwise dict and Series round to variable numbers of places. + Column names should be in the keys if `decimals` is a + dict-like, or in the index if `decimals` is a Series. Any + columns not included in `decimals` will be left as is. Elements + of `decimals` which are not columns of the input will be + ignored. + *args + Additional keywords have no effect but might be accepted for + compatibility with numpy. + **kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + + Returns + ------- + DataFrame + A DataFrame with the affected columns rounded to the specified + number of decimal places. + + See Also + -------- + numpy.around : Round a numpy array to the given number of decimals. + Series.round : Round a Series to the given number of decimals. + + Examples + -------- + >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], + ... columns=['dogs', 'cats']) + >>> df + dogs cats + 0 0.21 0.32 + 1 0.01 0.67 + 2 0.66 0.03 + 3 0.21 0.18 + + By providing an integer each column is rounded to the same number + of decimal places + + >>> df.round(1) + dogs cats + 0 0.2 0.3 + 1 0.0 0.7 + 2 0.7 0.0 + 3 0.2 0.2 + + With a dict, the number of places for specific columns can be + specified with the column names as key and the number of decimal + places as value + + >>> df.round({'dogs': 1, 'cats': 0}) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + Using a Series, the number of places for specific columns can be + specified with the column names as index and the number of + decimal places as value + + >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) + >>> df.round(decimals) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + """ + from pandas.core.reshape.concat import concat + + def _dict_round(df: DataFrame, decimals): + for col, vals in df.items(): + try: + yield _series_round(vals, decimals[col]) + except KeyError: + yield vals + + def _series_round(ser: Series, decimals: int): + if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype): + return ser.round(decimals) + return ser + + nv.validate_round(args, kwargs) + + if isinstance(decimals, (dict, Series)): + if isinstance(decimals, Series) and not decimals.index.is_unique: + raise ValueError("Index of decimals must be unique") + if is_dict_like(decimals) and not all( + is_integer(value) for _, value in decimals.items() + ): + raise TypeError("Values in decimals must be integers") + new_cols = list(_dict_round(self, decimals)) + elif is_integer(decimals): + # Dispatch to Series.round + new_cols = [_series_round(v, decimals) for _, v in self.items()] + else: + raise TypeError("decimals must be an integer, a dict-like or a Series") + + if len(new_cols) > 0: + return self._constructor( + concat(new_cols, axis=1), index=self.index, columns=self.columns + 
).__finalize__(self, method="round") + else: + return self + + # ---------------------------------------------------------------------- + # Statistical methods, etc. + + @validate_bool_kwargs_from_keywords('numeric_only') + def corr( + self, + method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", + min_periods: int = 1, + numeric_only: bool = True, + ) -> DataFrame: + """ + Compute pairwise correlation of columns, excluding NA/null values. + + Parameters + ---------- + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. Note that the returned matrix from corr + will have 1 along the diagonals and will be symmetric + regardless of the callable's behavior. + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. Currently only available for Pearson + and Spearman correlation. + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + DataFrame + Correlation matrix. + + See Also + -------- + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + Series.corr : Compute the correlation between two Series. + + Notes + ----- + Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. + + * `Pearson correlation coefficient `_ + * `Kendall rank correlation coefficient `_ + * `Spearman's rank correlation coefficient `_ + + Examples + -------- + >>> def histogram_intersection(a, b): + ... v = np.minimum(a, b).sum().round(decimals=1) + ... return v + >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], + ... columns=['dogs', 'cats']) + >>> df.corr(method=histogram_intersection) + dogs cats + dogs 1.0 0.3 + cats 0.3 1.0 + + >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], + ... columns=['dogs', 'cats']) + >>> df.corr(min_periods=3) + dogs cats + dogs 1.0 NaN + cats NaN 1.0 + """ # noqa:E501 + if numeric_only: + data = self._get_numeric_data() + else: + data = self + cols = data.columns + idx = cols.copy() + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) + + if method == "pearson": + correl = libalgos.nancorr(mat, minp=min_periods) + elif method == "spearman": + correl = libalgos.nancorr_spearman(mat, minp=min_periods) + elif method == "kendall" or callable(method): + if min_periods is None: + min_periods = 1 + mat = mat.T + corrf = nanops.get_corr_func(method) + K = len(cols) + correl = np.empty((K, K), dtype=float) + mask = np.isfinite(mat) + for i, ac in enumerate(mat): + for j, bc in enumerate(mat): + if i > j: + continue + + valid = mask[i] & mask[j] + if valid.sum() < min_periods: + c = np.nan + elif i == j: + c = 1.0 + elif not valid.all(): + c = corrf(ac[valid], bc[valid]) + else: + c = corrf(ac, bc) + correl[i, j] = c + correl[j, i] = c + else: + raise ValueError( + "method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + f"'{method}' was supplied" + ) + + return self._constructor(correl, index=idx, columns=cols) + + @validate_bool_kwargs_from_keywords('numeric_only') + def cov( + self, + min_periods: int | None = None, + ddof: int | None = 1, + numeric_only: bool = True, + ) -> DataFrame: + """ + Compute pairwise covariance of columns, excluding NA/null values. 
+ + Compute the pairwise covariance among the series of a DataFrame. + The returned data frame is the `covariance matrix + `__ of the columns + of the DataFrame. + + Both NA and null values are automatically excluded from the + calculation. (See the note below about bias from missing values.) + A threshold can be set for the minimum number of + observations for each value created. Comparisons with observations + below this threshold will be returned as ``NaN``. + + This method is generally used for the analysis of time series data to + understand the relationship between different measures + across time. + + Parameters + ---------- + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + .. versionadded:: 1.1.0 + + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + DataFrame + The covariance matrix of the series of the DataFrame. + + See Also + -------- + Series.cov : Compute covariance with another Series. + core.window.ExponentialMovingWindow.cov: Exponential weighted sample covariance. + core.window.Expanding.cov : Expanding sample covariance. + core.window.Rolling.cov : Rolling sample covariance. + + Notes + ----- + Returns the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-ddof. + + For DataFrames that have Series that are missing data (assuming that + data is `missing at random + `__) + the returned covariance matrix will be an unbiased estimate + of the variance and covariance between the member Series. + + However, for many applications this estimate may not be acceptable + because the estimate covariance matrix is not guaranteed to be positive + semi-definite. This could lead to estimate correlations having + absolute values which are greater than one, and/or a non-invertible + covariance matrix. See `Estimation of covariance matrices + `__ for more details. + + Examples + -------- + >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], + ... columns=['dogs', 'cats']) + >>> df.cov() + dogs cats + dogs 0.666667 -1.000000 + cats -1.000000 1.666667 + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(1000, 5), + ... columns=['a', 'b', 'c', 'd', 'e']) + >>> df.cov() + a b c d e + a 0.998438 -0.020161 0.059277 -0.008943 0.014144 + b -0.020161 1.059352 -0.008543 -0.024738 0.009826 + c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 + d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 + e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + + **Minimum number of periods** + + This method also supports an optional ``min_periods`` keyword + that specifies the required minimum number of non-NA observations for + each column pair in order to have a valid result: + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(20, 3), + ... 
columns=['a', 'b', 'c']) + >>> df.loc[df.index[:5], 'a'] = np.nan + >>> df.loc[df.index[5:10], 'b'] = np.nan + >>> df.cov(min_periods=12) + a b c + a 0.316741 NaN -0.150812 + b NaN 1.248003 0.191417 + c -0.150812 0.191417 0.895202 + """ + if numeric_only: + data = self._get_numeric_data() + else: + data = self + cols = data.columns + idx = cols.copy() + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) + + if notna(mat).all(): + if min_periods is not None and min_periods > len(mat): + base_cov = np.empty((mat.shape[1], mat.shape[1])) + base_cov.fill(np.nan) + else: + base_cov = np.cov(mat.T, ddof=ddof) + base_cov = base_cov.reshape((len(cols), len(cols))) + else: + base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) + + return self._constructor(base_cov, index=idx, columns=cols) + + @validate_bool_kwargs_from_keywords('numeric_only') + def corrwith( + self, + other, + axis: Axis = 0, + drop=False, + method="pearson", + numeric_only: bool = True, + ) -> Series: + """ + Compute pairwise correlation. + + Pairwise correlation is computed between rows or columns of + DataFrame with rows or columns of Series or DataFrame. DataFrames + are first aligned along both axes before computing the + correlations. + + Parameters + ---------- + other : DataFrame, Series + Object with which to compute correlations. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for + row-wise. + drop : bool, default False + Drop missing indices from result. + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. + + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Pairwise correlations. + + See Also + -------- + DataFrame.corr : Compute pairwise correlation of columns. + + Examples + -------- + >>> index = ["a", "b", "c", "d", "e"] + >>> columns = ["one", "two", "three", "four"] + >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) + >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) + >>> df1.corrwith(df2) + one 1.0 + two 1.0 + three 1.0 + four 1.0 + dtype: float64 + + >>> df2.corrwith(df1, axis=1) + a 1.0 + b 1.0 + c 1.0 + d 1.0 + e NaN + dtype: float64 + """ # noqa:E501 + axis = self._get_axis_number(axis) + if numeric_only: + this = self._get_numeric_data() + else: + this = self + + # GH46174: when other is a Series object and axis=0, we achieve a speedup over + # passing .corr() to .apply() by taking the columns as ndarrays and iterating + # over the transposition row-wise. Then we delegate the correlation coefficient + # computation and null-masking to np.corrcoef and np.isnan respectively, + # which are much faster. We exploit the fact that the Spearman correlation + # of two vectors is equal to the Pearson correlation of their ranks to use + # substantially the same method for Pearson and Spearman, + # just with intermediate argsorts on the latter. 
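+        #
+        # Illustrative only (not part of the algorithm below): the rank
+        # trick in miniature, where a double argsort converts values to
+        # 0-based ranks.
+        #
+        #     x = np.array([3.0, 1.0, 2.0])
+        #     y = np.array([30.0, 10.0, 20.0])
+        #     np.corrcoef(x.argsort().argsort(), y.argsort().argsort())[0, 1]
+        #     # -> 1.0, the Spearman correlation of x and y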
+ if isinstance(other, Series): + if axis == 0 and method in ["pearson", "spearman"]: + corrs = {} + if numeric_only: + cols = self.select_dtypes(include=np.number).columns + ndf = self[cols].values.transpose() + else: + cols = self.columns + ndf = self.values.transpose() + k = other.values + if method == "pearson": + for i, r in enumerate(ndf): + nonnull_mask = ~np.isnan(r) & ~np.isnan(k) + corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[ + 0, 1 + ] + else: + for i, r in enumerate(ndf): + nonnull_mask = ~np.isnan(r) & ~np.isnan(k) + corrs[cols[i]] = np.corrcoef( + r[nonnull_mask].argsort().argsort(), + k[nonnull_mask].argsort().argsort(), + )[0, 1] + return Series(corrs) + else: + return this.apply(lambda x: other.corr(x, method=method), axis=axis) + + other = other._get_numeric_data() + left, right = this.align(other, join="inner", copy=False) + + if axis == 1: + left = left.T + right = right.T + + if method == "pearson": + # mask missing values + left = left + right * 0 + right = right + left * 0 + + # demeaned data + ldem = left - left.mean() + rdem = right - right.mean() + + num = (ldem * rdem).sum() + dom = (left.count() - 1) * left.std() * right.std() + + correl = num / dom + + elif method in ["kendall", "spearman"] or callable(method): + + def c(x): + return nanops.nancorr(x[0], x[1], method=method) + + correl = self._constructor_sliced( + map(c, zip(left.values.T, right.values.T)), index=left.columns + ) + + else: + raise ValueError( + f"Invalid method {method} was passed, " + "valid methods are: 'pearson', 'kendall', " + "'spearman', or callable" + ) + + if not drop: + # Find non-matching labels along the given axis + # and append missing correlations (GH 22375) + raxis = 1 if axis == 0 else 0 + result_index = this._get_axis(raxis).union(other._get_axis(raxis)) + idx_diff = result_index.difference(correl.index) + + if len(idx_diff) > 0: + correl = correl._append( + Series([np.nan] * len(idx_diff), index=idx_diff) + ) + + return correl + + # ---------------------------------------------------------------------- + # ndarray-like stats methods + + @validate_bool_kwargs_from_keywords('numeric_only') + def count( + self, axis: Axis = 0, level: Level | None = None, numeric_only: bool = False + ): + """ + Count non-NA cells for each column or row. + + The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending + on `pandas.options.mode.use_inf_as_na`) are considered NA. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index' counts are generated for each column. + If 1 or 'columns' counts are generated for each row. + level : int or str, optional + If the axis is a `MultiIndex` (hierarchical), count along a + particular `level`, collapsing into a `DataFrame`. + A `str` specifies the level name. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + Returns + ------- + Series or DataFrame + For each column/row the number of non-NA/null entries. + If `level` is specified returns a `DataFrame`. + + See Also + -------- + Series.count: Number of non-NA elements in a Series. + DataFrame.value_counts: Count unique combinations of columns. + DataFrame.shape: Number of DataFrame rows and columns (including NA + elements). + DataFrame.isna: Boolean same-sized DataFrame showing places of NA + elements. + + Examples + -------- + Constructing DataFrame from a dictionary: + + >>> df = pd.DataFrame({"Person": + ... ["John", "Myla", "Lewis", "John", "Myla"], + ... 
"Age": [24., np.nan, 21., 33, 26], + ... "Single": [False, True, True, True, False]}) + >>> df + Person Age Single + 0 John 24.0 False + 1 Myla NaN True + 2 Lewis 21.0 True + 3 John 33.0 True + 4 Myla 26.0 False + + Notice the uncounted NA values: + + >>> df.count() + Person 5 + Age 4 + Single 5 + dtype: int64 + + Counts for each **row**: + + >>> df.count(axis='columns') + 0 3 + 1 2 + 2 3 + 3 3 + 4 3 + dtype: int64 + """ + axis = self._get_axis_number(axis) + if level is not None: + warnings.warn( + "Using the level keyword in DataFrame and Series aggregations is " + "deprecated and will be removed in a future version. Use groupby " + "instead. df.count(level=1) should use df.groupby(level=1).count().", + FutureWarning, + stacklevel=find_stack_level(), + ) + res = self._count_level(level, axis=axis, numeric_only=numeric_only) + return res.__finalize__(self, method="count") + + if numeric_only: + frame = self._get_numeric_data() + else: + frame = self + + # GH #423 + if len(frame._get_axis(axis)) == 0: + result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) + else: + if frame._is_mixed_type or frame._mgr.any_extension_types: + # the or any_extension_types is really only hit for single- + # column frames with an extension array + result = notna(frame).sum(axis=axis) + else: + # GH13407 + series_counts = notna(frame).sum(axis=axis) + counts = series_counts.values + result = self._constructor_sliced( + counts, index=frame._get_agg_axis(axis) + ) + + return result.astype("int64").__finalize__(self, method="count") + + @validate_bool_kwargs_from_keywords('numeric_only') + def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False): + if numeric_only: + frame = self._get_numeric_data() + else: + frame = self + + count_axis = frame._get_axis(axis) + agg_axis = frame._get_agg_axis(axis) + + if not isinstance(count_axis, MultiIndex): + raise TypeError( + f"Can only count levels on hierarchical {self._get_axis_name(axis)}." 
+ ) + + # Mask NaNs: Mask rows or columns where the index level is NaN, and all + # values in the DataFrame that are NaN + if frame._is_mixed_type: + # Since we have mixed types, calling notna(frame.values) might + # upcast everything to object + values_mask = notna(frame).values + else: + # But use the speedup when we have homogeneous dtypes + values_mask = notna(frame.values) + + index_mask = notna(count_axis.get_level_values(level=level)) + if axis == 1: + mask = index_mask & values_mask + else: + mask = index_mask.reshape(-1, 1) & values_mask + + if isinstance(level, str): + level = count_axis._get_level_number(level) + + level_name = count_axis._names[level] + level_index = count_axis.levels[level]._rename(name=level_name) + level_codes = ensure_platform_int(count_axis.codes[level]) + counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) + + if axis == 1: + result = self._constructor(counts, index=agg_axis, columns=level_index) + else: + result = self._constructor(counts, index=level_index, columns=agg_axis) + + return result + + @validate_bool_kwargs_from_keywords('numeric_only') + def _reduce( + self, + op, + name: str, + *, + axis: Axis = 0, + skipna: bool = True, + numeric_only: bool | None = None, + filter_type=None, + **kwds, + ): + + assert filter_type is None or filter_type == "bool", filter_type + out_dtype = "bool" if filter_type == "bool" else None + + if numeric_only is None and name in ["mean", "median"]: + own_dtypes = [arr.dtype for arr in self._mgr.arrays] + + dtype_is_dt = np.array( + [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], + dtype=bool, + ) + if dtype_is_dt.any(): + warnings.warn( + "DataFrame.mean and DataFrame.median with numeric_only=None " + "will include datetime64 and datetime64tz columns in a " + "future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + # Non-copy equivalent to + # dt64_cols = self.dtypes.apply(is_datetime64_any_dtype) + # cols = self.columns[~dt64_cols] + # self = self[cols] + predicate = lambda x: not is_datetime64_any_dtype(x.dtype) + mgr = self._mgr._get_data_subset(predicate) + self = type(self)(mgr) + + # TODO: Make other agg func handle axis=None properly GH#21597 + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + assert axis in [0, 1] + + def func(values: np.ndarray): + # We only use this in the case that operates on self.values + return op(values, axis=axis, skipna=skipna, **kwds) + + def blk_func(values, axis=1): + if isinstance(values, ExtensionArray): + if not is_1d_only_ea_dtype(values.dtype) and not isinstance( + self._mgr, ArrayManager + ): + return values._reduce(name, axis=1, skipna=skipna, **kwds) + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=axis, skipna=skipna, **kwds) + + def _get_data() -> DataFrame: + if filter_type is None: + data = self._get_numeric_data() + else: + # GH#25101, GH#24434 + assert filter_type == "bool" + data = self._get_bool_data() + return data + + if numeric_only is not None or axis == 0: + # For numeric_only non-None and axis non-None, we know + # which blocks to use and no try/except is needed. 
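+            #  (e.g. df.sum(numeric_only=True) takes this path: non-numeric
+            #  columns are dropped via _get_data() and the remaining blocks
+            #  are reduced directly by BlockManager.reduce.)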
+ # For numeric_only=None only the case with axis==0 and no object + # dtypes are unambiguous can be handled with BlockManager.reduce + # Case with EAs see GH#35881 + df = self + if numeric_only is True: + df = _get_data() + if axis == 1: + df = df.T + axis = 0 + + ignore_failures = numeric_only is None + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager.reduce + res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) + out = df._constructor(res).iloc[0] + if out_dtype is not None: + out = out.astype(out_dtype) + if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) + + if numeric_only is None and out.shape[0] != df.shape[1]: + # columns have been dropped GH#41480 + arg_name = "numeric_only" + if name in ["all", "any"]: + arg_name = "bool_only" + warnings.warn( + "Dropping of nuisance columns in DataFrame reductions " + f"(with '{arg_name}=None') is deprecated; in a future " + "version this will raise TypeError. Select only valid " + "columns before calling the reduction.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return out + + assert numeric_only is None + + data = self + values = data.values + + try: + result = func(values) + + except TypeError: + # e.g. in nanops trying to convert strs to float + + data = _get_data() + labels = data._get_agg_axis(axis) + + values = data.values + with np.errstate(all="ignore"): + result = func(values) + + # columns have been dropped GH#41480 + arg_name = "numeric_only" + if name in ["all", "any"]: + arg_name = "bool_only" + warnings.warn( + "Dropping of nuisance columns in DataFrame reductions " + f"(with '{arg_name}=None') is deprecated; in a future " + "version this will raise TypeError. Select only valid " + "columns before calling the reduction.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if hasattr(result, "dtype"): + if filter_type == "bool" and notna(result).all(): + result = result.astype(np.bool_) + elif filter_type is None and is_object_dtype(result.dtype): + try: + result = result.astype(np.float64) + except (ValueError, TypeError): + # try to coerce to the original dtypes item by item if we can + pass + + result = self._constructor_sliced(result, index=labels) + return result + + @validate_bool_kwargs_from_keywords('skipna') + def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: + """ + Special case for _reduce to try to avoid a potentially-expensive transpose. + + Apply the reduction block-wise along axis=1 and then reduce the resulting + 1D arrays. 
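+
+        For example, ``df.all(axis=1)`` can be served here: ``func`` reduces
+        each block along its column dimension to one value per row, and the
+        per-block results are folded together with ``np.logical_and``.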
+ """ + if name == "all": + result = np.ones(len(self), dtype=bool) + ufunc = np.logical_and + elif name == "any": + result = np.zeros(len(self), dtype=bool) + # error: Incompatible types in assignment + # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'], + # Literal[20], Literal[False]]", variable has type + # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20], + # Literal[True]]") + ufunc = np.logical_or # type: ignore[assignment] + else: + raise NotImplementedError(name) + + for arr in self._mgr.arrays: + middle = func(arr, axis=0, skipna=skipna) + result = ufunc(result, middle) + + res_ser = self._constructor_sliced(result, index=self.index) + return res_ser + + @validate_bool_kwargs_from_keywords('dropna') + def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: + """ + Count number of distinct elements in specified axis. + + Return Series with number of distinct elements. Can ignore NaN + values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + Series + + See Also + -------- + Series.nunique: Method nunique for Series. + DataFrame.count: Count non-NA cells for each column or row. + + Examples + -------- + >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df.nunique() + A 3 + B 2 + dtype: int64 + + >>> df.nunique(axis=1) + 0 1 + 1 2 + 2 2 + dtype: int64 + """ + return self.apply(Series.nunique, axis=axis, dropna=dropna) + + @doc(_shared_docs["idxmin"], numeric_only_default="False") + @validate_bool_kwargs_from_keywords('skipna', 'numeric_only') + def idxmin( + self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False + ) -> Series: + axis = self._get_axis_number(axis) + if numeric_only: + data = self._get_numeric_data() + else: + data = self + + res = data._reduce( + nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False + ) + indices = res._values + + # indices will always be np.ndarray since axis is not None and + # values is a 2d array for DataFrame + # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" + assert isinstance(indices, np.ndarray) # for mypy + + index = data._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return data._constructor_sliced(result, index=data._get_agg_axis(axis)) + + @doc(_shared_docs["idxmax"], numeric_only_default="False") + @validate_bool_kwargs_from_keywords('skipna', 'numeric_only') + def idxmax( + self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False + ) -> Series: + + axis = self._get_axis_number(axis) + if numeric_only: + data = self._get_numeric_data() + else: + data = self + + res = data._reduce( + nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False + ) + indices = res._values + + # indices will always be np.ndarray since axis is not None and + # values is a 2d array for DataFrame + # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" + assert isinstance(indices, np.ndarray) # for mypy + + index = data._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return data._constructor_sliced(result, index=data._get_agg_axis(axis)) + + def _get_agg_axis(self, axis_num: int) -> Index: + """ + Let's be explicit about this. 
+ """ + if axis_num == 0: + return self.columns + elif axis_num == 1: + return self.index + else: + raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") + + @validate_bool_kwargs_from_keywords('skipna', 'dropna') + def mode( + self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True + ) -> DataFrame: + """ + Get the mode(s) of each element along the selected axis. + + The mode of a set of values is the value that appears most often. + It can be multiple values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to iterate over while searching for the mode: + + * 0 or 'index' : get mode of each column + * 1 or 'columns' : get mode of each row. + + numeric_only : bool, default False + If True, only apply to numeric columns. + dropna : bool, default True + Don't consider counts of NaN/NaT. + + Returns + ------- + DataFrame + The modes of each column or row. + + See Also + -------- + Series.mode : Return the highest frequency value in a Series. + Series.value_counts : Return the counts of values in a Series. + + Examples + -------- + >>> df = pd.DataFrame([('bird', 2, 2), + ... ('mammal', 4, np.nan), + ... ('arthropod', 8, 0), + ... ('bird', 2, np.nan)], + ... index=('falcon', 'horse', 'spider', 'ostrich'), + ... columns=('species', 'legs', 'wings')) + >>> df + species legs wings + falcon bird 2 2.0 + horse mammal 4 NaN + spider arthropod 8 0.0 + ostrich bird 2 NaN + + By default, missing values are not considered, and the mode of wings + are both 0 and 2. Because the resulting DataFrame has two rows, + the second row of ``species`` and ``legs`` contains ``NaN``. + + >>> df.mode() + species legs wings + 0 bird 2.0 0.0 + 1 NaN NaN 2.0 + + Setting ``dropna=False`` ``NaN`` values are considered and they can be + the mode (like for wings). + + >>> df.mode(dropna=False) + species legs wings + 0 bird 2 NaN + + Setting ``numeric_only=True``, only the mode of numeric columns is + computed, and columns of other types are ignored. + + >>> df.mode(numeric_only=True) + legs wings + 0 2.0 0.0 + 1 NaN 2.0 + + To compute the mode over columns and not rows, use the axis parameter: + + >>> df.mode(axis='columns', numeric_only=True) + 0 1 + falcon 2.0 NaN + horse 4.0 NaN + spider 0.0 8.0 + ostrich 2.0 NaN + """ + data = self if not numeric_only else self._get_numeric_data() + + def f(s): + return s.mode(dropna=dropna) + + data = data.apply(f, axis=axis) + # Ensure index is type stable (should always use int index) + if data.empty: + data.index = default_index(0) + + return data + + def quantile( + self, + q=0.5, + axis: Axis = 0, + numeric_only: bool | lib.NoDefault = no_default, + interpolation: str = "linear", + ): + """ + Return values at the given quantile over requested axis. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value between 0 <= q <= 1, the quantile(s) to compute. + axis : {0, 1, 'index', 'columns'}, default 0 + Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + numeric_only : bool, default True + If False, the quantile of datetime and timedelta data will be + computed as well. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. 
+            * nearest: `i` or `j` whichever is nearest.
+            * midpoint: (`i` + `j`) / 2.
+
+        Returns
+        -------
+        Series or DataFrame
+
+            If ``q`` is an array, a DataFrame will be returned where the
+              index is ``q``, the columns are the columns of self, and the
+              values are the quantiles.
+            If ``q`` is a float, a Series will be returned where the
+              index is the columns of self and the values are the quantiles.
+
+        See Also
+        --------
+        core.window.Rolling.quantile: Rolling quantile.
+        numpy.percentile: Numpy function to compute the percentile.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
+        ...                   columns=['a', 'b'])
+        >>> df.quantile(.1)
+        a    1.3
+        b    3.7
+        Name: 0.1, dtype: float64
+        >>> df.quantile([.1, .5])
+               a     b
+        0.1  1.3   3.7
+        0.5  2.5  55.0
+
+        Specifying `numeric_only=False` will also compute the quantile of
+        datetime and timedelta data.
+
+        >>> df = pd.DataFrame({'A': [1, 2],
+        ...                    'B': [pd.Timestamp('2010'),
+        ...                          pd.Timestamp('2011')],
+        ...                    'C': [pd.Timedelta('1 days'),
+        ...                          pd.Timedelta('2 days')]})
+        >>> df.quantile(0.5, numeric_only=False)
+        A                    1.5
+        B    2010-07-02 12:00:00
+        C        1 days 12:00:00
+        Name: 0.5, dtype: object
+        """
+        validate_percentile(q)
+        axis = self._get_axis_number(axis)
+        any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes)
+        if numeric_only is no_default and any_not_numeric:
+            warnings.warn(
+                "In future versions of pandas, numeric_only will be set to "
+                "False by default, and the datetime/timedelta columns will "
+                "be considered in the results. To not consider these columns, "
+                "specify numeric_only=True.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+            numeric_only = True
+
+        if not is_list_like(q):
+            # BlockManager.quantile expects listlike, so we wrap and unwrap here
+            res_df = self.quantile(
+                [q], axis=axis, numeric_only=numeric_only, interpolation=interpolation
+            )
+            res = res_df.iloc[0]
+            if axis == 1 and len(self) == 0:
+                # GH#41544 try to get an appropriate dtype
+                dtype = find_common_type(list(self.dtypes))
+                if needs_i8_conversion(dtype):
+                    return res.astype(dtype)
+            return res
+
+        q = Index(q, dtype=np.float64)
+        data = self._get_numeric_data() if numeric_only else self
+
+        if axis == 1:
+            data = data.T
+
+        if len(data.columns) == 0:
+            # GH#23925 _get_numeric_data may have dropped all columns
+            cols = Index([], name=self.columns.name)
+
+            dtype = np.float64
+            if axis == 1:
+                # GH#41544 try to get an appropriate dtype
+                cdtype = find_common_type(list(self.dtypes))
+                if needs_i8_conversion(cdtype):
+                    dtype = cdtype
+
+            if is_list_like(q):
+                res = self._constructor([], index=q, columns=cols, dtype=dtype)
+                return res.__finalize__(self, method="quantile")
+            return self._constructor_sliced([], index=cols, name=q, dtype=dtype)
+
+        res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation)
+
+        result = self._constructor(res)
+        return result.__finalize__(self, method="quantile")
+
+    @doc(NDFrame.asfreq, **_shared_doc_kwargs)
+    @validate_bool_kwargs_from_keywords('normalize')
+    def asfreq(
+        self,
+        freq: Frequency,
+        method=None,
+        how: str | None = None,
+        normalize: bool = False,
+        fill_value=None,
+    ) -> DataFrame:
+        return super().asfreq(
+            freq=freq,
+            method=method,
+            how=how,
+            normalize=normalize,
+            fill_value=fill_value,
+        )
+
+    @doc(NDFrame.resample, **_shared_doc_kwargs)
+    def resample(
+        self,
+        rule,
+        axis=0,
+        closed: str | None = None,
+        label: str | None = None,
+        convention: str = "start",
+        kind: str | None = None,
+        loffset=None,
+        base: int | None = None,
+        on=None,
+
level=None, + origin: str | TimestampConvertibleTypes = "start_day", + offset: TimedeltaConvertibleTypes | None = None, + group_keys: bool | lib.NoDefault = no_default, + ) -> Resampler: + return super().resample( + rule=rule, + axis=axis, + closed=closed, + label=label, + convention=convention, + kind=kind, + loffset=loffset, + base=base, + on=on, + level=level, + origin=origin, + offset=offset, + group_keys=group_keys, + ) + + @validate_bool_kwargs_from_keywords('copy') + def to_timestamp( + self, + freq: Frequency | None = None, + how: str = "start", + axis: Axis = 0, + copy: bool = True, + ) -> DataFrame: + """ + Cast to DatetimeIndex of timestamps, at *beginning* of period. + + Parameters + ---------- + freq : str, default frequency of PeriodIndex + Desired frequency. + how : {'s', 'e', 'start', 'end'} + Convention for converting period to timestamp; start of period + vs. end. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to convert (the index by default). + copy : bool, default True + If False then underlying input data is not copied. + + Returns + ------- + DataFrame with DatetimeIndex + """ + new_obj = self.copy(deep=copy) + + axis_name = self._get_axis_name(axis) + old_ax = getattr(self, axis_name) + if not isinstance(old_ax, PeriodIndex): + raise TypeError(f"unsupported Type {type(old_ax).__name__}") + + new_ax = old_ax.to_timestamp(freq=freq, how=how) + + setattr(new_obj, axis_name, new_ax) + return new_obj + + @validate_bool_kwargs_from_keywords('copy') + def to_period( + self, freq: Frequency | None = None, axis: Axis = 0, copy: bool = True + ) -> DataFrame: + """ + Convert DataFrame from DatetimeIndex to PeriodIndex. + + Convert DataFrame from DatetimeIndex to PeriodIndex with desired + frequency (inferred from index if not passed). + + Parameters + ---------- + freq : str, default + Frequency of the PeriodIndex. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to convert (the index by default). + copy : bool, default True + If False then underlying input data is not copied. + + Returns + ------- + DataFrame with PeriodIndex + + Examples + -------- + >>> idx = pd.to_datetime( + ... [ + ... "2001-03-31 00:00:00", + ... "2002-05-31 00:00:00", + ... "2003-08-31 00:00:00", + ... ] + ... ) + + >>> idx + DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], + dtype='datetime64[ns]', freq=None) + + >>> idx.to_period("M") + PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') + + For the yearly frequency + + >>> idx.to_period("Y") + PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]') + """ + new_obj = self.copy(deep=copy) + + axis_name = self._get_axis_name(axis) + old_ax = getattr(self, axis_name) + if not isinstance(old_ax, DatetimeIndex): + raise TypeError(f"unsupported Type {type(old_ax).__name__}") + + new_ax = old_ax.to_period(freq=freq) + + setattr(new_obj, axis_name, new_ax) + return new_obj + + def isin(self, values) -> DataFrame: + """ + Whether each element in the DataFrame is contained in values. + + Parameters + ---------- + values : iterable, Series, DataFrame or dict + The result will only be true at a location if all the + labels match. If `values` is a Series, that's the index. If + `values` is a dict, the keys must be the column names, + which must match. If `values` is a DataFrame, + then both the index and column labels must match. + + Returns + ------- + DataFrame + DataFrame of booleans showing whether each element in the DataFrame + is contained in values. 
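Note: the to_timestamp/to_period examples above only exercise the Index-level
conversion. A short frame-level round trip, with illustrative data that is not
from the patch, behaves as follows:

    import pandas as pd

    df = pd.DataFrame(
        {"sales": [10, 20, 30]},
        index=pd.to_datetime(["2001-03-31", "2002-05-31", "2003-08-31"]),
    )
    monthly = df.to_period(freq="M")          # index becomes PeriodIndex(['2001-03', ...])
    back = monthly.to_timestamp(how="start")  # DatetimeIndex at each period's first day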
+ + See Also + -------- + DataFrame.eq: Equality test for DataFrame. + Series.isin: Equivalent method on Series. + Series.str.contains: Test if pattern or regex is contained within a + string of a Series or Index. + + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, + ... index=['falcon', 'dog']) + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + + When ``values`` is a list check whether every value in the DataFrame + is present in the list (which animals have 0 or 2 legs or wings) + + >>> df.isin([0, 2]) + num_legs num_wings + falcon True True + dog False True + + To check if ``values`` is *not* in the DataFrame, use the ``~`` operator: + + >>> ~df.isin([0, 2]) + num_legs num_wings + falcon False False + dog True False + + When ``values`` is a dict, we can pass values to check for each + column separately: + + >>> df.isin({'num_wings': [0, 3]}) + num_legs num_wings + falcon False False + dog False True + + When ``values`` is a Series or DataFrame the index and column must + match. Note that 'falcon' does not match based on the number of legs + in other. + + >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]}, + ... index=['spider', 'falcon']) + >>> df.isin(other) + num_legs num_wings + falcon False True + dog False False + """ + if isinstance(values, dict): + from pandas.core.reshape.concat import concat + + values = collections.defaultdict(list, values) + result = concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, + ) + elif isinstance(values, Series): + if not values.index.is_unique: + raise ValueError("cannot compute isin with a duplicate axis.") + result = self.eq(values.reindex_like(self), axis="index") + elif isinstance(values, DataFrame): + if not (values.columns.is_unique and values.index.is_unique): + raise ValueError("cannot compute isin with a duplicate axis.") + result = self.eq(values.reindex_like(self)) + else: + if not is_list_like(values): + raise TypeError( + "only list-like or dict-like objects are allowed " + "to be passed to DataFrame.isin(), " + f"you passed a '{type(values).__name__}'" + ) + result = self._constructor( + algorithms.isin(self.values.ravel(), values).reshape(self.shape), + self.index, + self.columns, + ) + return result.__finalize__(self, method="isin") + + # ---------------------------------------------------------------------- + # Add index and columns + _AXIS_ORDERS = ["index", "columns"] + _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { + **NDFrame._AXIS_TO_AXIS_NUMBER, + 1: 1, + "columns": 1, + } + _AXIS_LEN = len(_AXIS_ORDERS) + _info_axis_number = 1 + _info_axis_name = "columns" + + index: Index = properties.AxisProperty( + axis=1, doc="The index (row labels) of the DataFrame." + ) + columns: Index = properties.AxisProperty( + axis=0, doc="The column labels of the DataFrame." + ) + + @property + def _AXIS_NUMBERS(self) -> dict[str, int]: + """.. deprecated:: 1.1.0""" + super()._AXIS_NUMBERS + return {"index": 0, "columns": 1} + + @property + def _AXIS_NAMES(self) -> dict[int, str]: + """.. 
deprecated:: 1.1.0""" + super()._AXIS_NAMES + return {0: "index", 1: "columns"} + + # ---------------------------------------------------------------------- + # Add plotting methods to DataFrame + plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) + hist = pandas.plotting.hist_frame + boxplot = pandas.plotting.boxplot_frame + sparse = CachedAccessor("sparse", SparseFrameAccessor) + + # ---------------------------------------------------------------------- + # Internal Interface Methods + + @validate_bool_kwargs_from_keywords('copy') + def _to_dict_of_blocks(self, copy: bool = True): + """ + Return a dict of dtype -> Constructor Types that + each is a homogeneous dtype. + + Internal ONLY - only works for BlockManager + """ + mgr = self._mgr + # convert to BlockManager if needed -> this way support ArrayManager as well + mgr = mgr_to_mgr(mgr, "block") + mgr = cast(BlockManager, mgr) + return { + k: self._constructor(v).__finalize__(self) + for k, v, in mgr.to_dict(copy=copy).items() + } + + @property + def values(self) -> np.ndarray: + """ + Return a Numpy representation of the DataFrame. + + .. warning:: + + We recommend using :meth:`DataFrame.to_numpy` instead. + + Only the values in the DataFrame will be returned, the axes labels + will be removed. + + Returns + ------- + numpy.ndarray + The values of the DataFrame. + + See Also + -------- + DataFrame.to_numpy : Recommended alternative to this method. + DataFrame.index : Retrieve the index labels. + DataFrame.columns : Retrieving the column names. + + Notes + ----- + The dtype will be a lower-common-denominator dtype (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. Use this + with care if you are not dealing with the blocks. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. By :func:`numpy.find_common_type` convention, mixing int64 + and uint64 will result in a float64 dtype. + + Examples + -------- + A DataFrame where all columns are the same type (e.g., int64) results + in an array of the same type. + + >>> df = pd.DataFrame({'age': [ 3, 29], + ... 'height': [94, 170], + ... 'weight': [31, 115]}) + >>> df + age height weight + 0 3 94 31 + 1 29 170 115 + >>> df.dtypes + age int64 + height int64 + weight int64 + dtype: object + >>> df.values + array([[ 3, 94, 31], + [ 29, 170, 115]]) + + A DataFrame with mixed type columns(e.g., str/object, int64, float32) + results in an ndarray of the broadest type that accommodates these + mixed types (e.g., object). + + >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), + ... ('lion', 80.5, 1), + ... ('monkey', np.nan, None)], + ... 
columns=('name', 'max_speed', 'rank')) + >>> df2.dtypes + name object + max_speed float64 + rank object + dtype: object + >>> df2.values + array([['parrot', 24.0, 'second'], + ['lion', 80.5, 1], + ['monkey', nan, None]], dtype=object) + """ + self._consolidate_inplace() + return self._mgr.as_array() + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @validate_bool_kwargs_from_keywords('inplace') + def ffill( + self: DataFrame, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> DataFrame | None: + return super().ffill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @validate_bool_kwargs_from_keywords('inplace') + def bfill( + self: DataFrame, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> DataFrame | None: + return super().bfill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "lower", "upper"] + ) + @validate_bool_kwargs_from_keywords('inplace') + def clip( + self: DataFrame, + lower=None, + upper=None, + axis: Axis | None = None, + inplace: bool = False, + *args, + **kwargs, + ) -> DataFrame | None: + return super().clip(lower, upper, axis, inplace, *args, **kwargs) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"]) + @validate_bool_kwargs_from_keywords('inplace') + def interpolate( + self: DataFrame, + method: str = "linear", + axis: Axis = 0, + limit: int | None = None, + inplace: bool = False, + limit_direction: str | None = None, + limit_area: str | None = None, + downcast: str | None = None, + **kwargs, + ) -> DataFrame | None: + return super().interpolate( + method, + axis, + limit, + inplace, + limit_direction, + limit_area, + downcast, + **kwargs, + ) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def where( + self, + cond, + other=lib.no_default, + inplace=False, + axis=None, + level=None, + errors: IgnoreRaise = "raise", + try_cast=lib.no_default, + ): + return super().where(cond, other, inplace, axis, level, errors, try_cast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors: IgnoreRaise = "raise", + try_cast=lib.no_default, + ): + return super().mask(cond, other, inplace, axis, level, errors, try_cast) + + +DataFrame._add_numeric_operations() + +ops.add_flex_arithmetic_methods(DataFrame) + + +def _from_nested_dict(data) -> collections.defaultdict: + new_data: collections.defaultdict = collections.defaultdict(dict) + for index, s in data.items(): + for col, v in s.items(): + new_data[col][index] = v + return new_data + + +def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: + # reindex if necessary + + if value.index.equals(index) or not len(index): + return value._values.copy() + + # GH#4107 + try: + reindexed_value = value.reindex(index)._values + except ValueError as err: + # raised in MultiIndex.from_tuples, see test_insert_error_msmgs + if not value.index.is_unique: + # duplicate axis + raise err + + raise TypeError( + "incompatible index of inserted column with frame index" + ) from err + return reindexed_value From 6f1c2d6ce529ccfd4cc1413e8187c81946dcb7d6 Mon Sep 17 00:00:00 2001 From: Lance <46547065+Condielj@users.noreply.github.com> Date: Wed, 4 May 2022 01:07:51 -0600 
Subject: [PATCH 2/8] Delete frame.py --- frame.py | 11445 ----------------------------------------------------- 1 file changed, 11445 deletions(-) delete mode 100644 frame.py diff --git a/frame.py b/frame.py deleted file mode 100644 index cf6ff527dc8d3..0000000000000 --- a/frame.py +++ /dev/null @@ -1,11445 +0,0 @@ -""" -DataFrame ---------- -An efficient 2D container for potentially mixed-type time series or other -labeled data series. - -Similar to its R counterpart, data.frame, except providing automatic data -alignment and a host of useful data manipulation methods having to do with the -labeling information -""" -from __future__ import annotations - -import collections -from collections import abc -import datetime -import functools -from io import StringIO -import itertools -from textwrap import dedent -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Hashable, - Iterable, - Iterator, - Literal, - Sequence, - cast, - overload, -) -import warnings - -import numpy as np -import numpy.ma as ma - -from pandas._config import get_option - -from pandas._libs import ( - algos as libalgos, - lib, - properties, -) -from pandas._libs.hashtable import duplicated -from pandas._libs.lib import ( - NoDefault, - no_default, -) -from pandas._typing import ( - AggFuncType, - AnyArrayLike, - ArrayLike, - Axes, - Axis, - ColspaceArgType, - CompressionOptions, - Dtype, - DtypeObj, - FilePath, - FillnaOptions, - FloatFormatType, - FormattersType, - Frequency, - IgnoreRaise, - IndexKeyFunc, - IndexLabel, - Level, - NaPosition, - PythonFuncType, - ReadBuffer, - Renamer, - Scalar, - SortKind, - StorageOptions, - Suffixes, - TimedeltaConvertibleTypes, - TimestampConvertibleTypes, - ValueKeyFunc, - WriteBuffer, - npt, -) -from pandas.compat._optional import import_optional_dependency -from pandas.compat.numpy import function as nv -from pandas.util._decorators import ( - Appender, - Substitution, - deprecate_kwarg, - deprecate_nonkeyword_arguments, - doc, - rewrite_axis_style_signature, -) -from pandas.util._exceptions import find_stack_level -from pandas.util._validators import ( - validate_ascending, - validate_axis_style_args, - validate_bool_kwarg, - validate_percentile, - validate_bool_kwargs_from_keywords, -) - -from pandas.core.dtypes.cast import ( - LossySetitemError, - can_hold_element, - construct_1d_arraylike_from_scalar, - construct_2d_arraylike_from_scalar, - find_common_type, - infer_dtype_from_scalar, - invalidate_string_dtypes, - maybe_box_native, - maybe_downcast_to_dtype, -) -from pandas.core.dtypes.common import ( - ensure_platform_int, - infer_dtype_from_object, - is_1d_only_ea_dtype, - is_bool_dtype, - is_dataclass, - is_datetime64_any_dtype, - is_dict_like, - is_dtype_equal, - is_extension_array_dtype, - is_float, - is_float_dtype, - is_hashable, - is_integer, - is_integer_dtype, - is_iterator, - is_list_like, - is_numeric_dtype, - is_object_dtype, - is_scalar, - is_sequence, - needs_i8_conversion, - pandas_dtype, -) -from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.missing import ( - isna, - notna, -) - -from pandas.core import ( - algorithms, - common as com, - nanops, - ops, -) -from pandas.core.accessor import CachedAccessor -from pandas.core.apply import ( - reconstruct_func, - relabel_result, -) -from pandas.core.array_algos.take import take_2d_multi -from pandas.core.arraylike import OpsMixin -from pandas.core.arrays import ( - DatetimeArray, - ExtensionArray, - PeriodArray, - TimedeltaArray, -) -from pandas.core.arrays.sparse import 
SparseFrameAccessor -from pandas.core.construction import ( - extract_array, - sanitize_array, - sanitize_masked_array, -) -from pandas.core.generic import NDFrame -from pandas.core.indexers import check_key_length -from pandas.core.indexes.api import ( - DatetimeIndex, - Index, - PeriodIndex, - default_index, - ensure_index, - ensure_index_from_sequences, -) -from pandas.core.indexes.multi import ( - MultiIndex, - maybe_droplevels, -) -from pandas.core.indexing import ( - check_bool_indexer, - check_deprecated_indexers, - convert_to_index_sliceable, -) -from pandas.core.internals import ( - ArrayManager, - BlockManager, -) -from pandas.core.internals.construction import ( - arrays_to_mgr, - dataclasses_to_dicts, - dict_to_mgr, - mgr_to_mgr, - ndarray_to_mgr, - nested_data_to_arrays, - rec_array_to_mgr, - reorder_arrays, - to_arrays, - treat_as_nested, -) -from pandas.core.reshape.melt import melt -from pandas.core.series import Series -from pandas.core.shared_docs import _shared_docs -from pandas.core.sorting import ( - get_group_index, - lexsort_indexer, - nargsort, -) - -from pandas.io.common import get_handle -from pandas.io.formats import ( - console, - format as fmt, -) -from pandas.io.formats.info import ( - INFO_DOCSTRING, - DataFrameInfo, - frame_sub_kwargs, -) -import pandas.plotting - -if TYPE_CHECKING: - - from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg - from pandas.core.groupby.generic import DataFrameGroupBy - from pandas.core.internals import SingleDataManager - from pandas.core.resample import Resampler - - from pandas.io.formats.style import Styler - -# --------------------------------------------------------------------- -# Docstring templates - -_shared_doc_kwargs = { - "axes": "index, columns", - "klass": "DataFrame", - "axes_single_arg": "{0 or 'index', 1 or 'columns'}", - "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 - If 0 or 'index': apply function to each column. - If 1 or 'columns': apply function to each row.""", - "inplace": """ - inplace : bool, default False - If True, performs operation inplace and returns None.""", - "optional_by": """ - by : str or list of str - Name or list of names to sort by. - - - if `axis` is 0 or `'index'` then `by` may contain index - levels and/or column labels. - - if `axis` is 1 or `'columns'` then `by` may contain column - levels and/or index labels.""", - "optional_labels": """labels : array-like, optional - New labels / index to conform the axis specified by 'axis' to.""", - "optional_axis": """axis : int or str, optional - Axis to target. Can be either the axis name ('index', 'columns') - or number (0, 1).""", - "replace_iloc": """ - This differs from updating with ``.loc`` or ``.iloc``, which require - you to specify a location to update with some value.""", -} - -_numeric_only_doc = """numeric_only : bool or None, default None - Include only float, int, boolean data. If None, will attempt to use - everything, then use only numeric data -""" - -_merge_doc = """ -Merge DataFrame or named Series objects with a database-style join. - -A named Series object is treated as a DataFrame with a single named column. - -The join is done on columns or indexes. If joining columns on -columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes -on indexes or indexes on a column or columns, the index will be passed on. -When performing a cross merge, no column specifications to merge on are -allowed. - -.. 
warning:: - - If both key columns contain rows where the key is a null value, those - rows will be matched against each other. This is different from usual SQL - join behaviour and can lead to unexpected results. - -Parameters -----------%s -right : DataFrame or named Series - Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' - Type of merge to be performed. - - * left: use only keys from left frame, similar to a SQL left outer join; - preserve key order. - * right: use only keys from right frame, similar to a SQL right outer join; - preserve key order. - * outer: use union of keys from both frames, similar to a SQL full outer - join; sort keys lexicographically. - * inner: use intersection of keys from both frames, similar to a SQL inner - join; preserve the order of the left keys. - * cross: creates the cartesian product from both frames, preserves the order - of the left keys. - - .. versionadded:: 1.2.0 - -on : label or list - Column or index level names to join on. These must be found in both - DataFrames. If `on` is None and not merging on indexes then this defaults - to the intersection of the columns in both DataFrames. -left_on : label or list, or array-like - Column or index level names to join on in the left DataFrame. Can also - be an array or list of arrays of the length of the left DataFrame. - These arrays are treated as if they are columns. -right_on : label or list, or array-like - Column or index level names to join on in the right DataFrame. Can also - be an array or list of arrays of the length of the right DataFrame. - These arrays are treated as if they are columns. -left_index : bool, default False - Use the index from the left DataFrame as the join key(s). If it is a - MultiIndex, the number of keys in the other DataFrame (either the index - or a number of columns) must match the number of levels. -right_index : bool, default False - Use the index from the right DataFrame as the join key. Same caveats as - left_index. -sort : bool, default False - Sort the join keys lexicographically in the result DataFrame. If False, - the order of the join keys depends on the join type (how keyword). -suffixes : list-like, default is ("_x", "_y") - A length-2 sequence where each element is optionally a string - indicating the suffix to add to overlapping column names in - `left` and `right` respectively. Pass a value of `None` instead - of a string to indicate that the column name from `left` or - `right` should be left as-is, with no suffix. At least one of the - values must not be None. -copy : bool, default True - If False, avoid copy if possible. -indicator : bool or str, default False - If True, adds a column to the output DataFrame called "_merge" with - information on the source of each row. The column can be given a different - name by providing a string argument. The column will have a Categorical - type with the value of "left_only" for observations whose merge key only - appears in the left DataFrame, "right_only" for observations - whose merge key only appears in the right DataFrame, and "both" - if the observation's merge key is found in both DataFrames. - -validate : str, optional - If specified, checks if merge is of specified type. - - * "one_to_one" or "1:1": check if merge keys are unique in both - left and right datasets. - * "one_to_many" or "1:m": check if merge keys are unique in left - dataset. - * "many_to_one" or "m:1": check if merge keys are unique in right - dataset. 
- * "many_to_many" or "m:m": allowed, but does not result in checks. - -Returns -------- -DataFrame - A DataFrame of the two merged objects. - -See Also --------- -merge_ordered : Merge with optional filling/interpolation. -merge_asof : Merge on nearest keys. -DataFrame.join : Similar method using indices. - -Notes ------ -Support for specifying index levels as the `on`, `left_on`, and -`right_on` parameters was added in version 0.23.0 -Support for merging named Series objects was added in version 0.24.0 - -Examples --------- ->>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], -... 'value': [1, 2, 3, 5]}) ->>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], -... 'value': [5, 6, 7, 8]}) ->>> df1 - lkey value -0 foo 1 -1 bar 2 -2 baz 3 -3 foo 5 ->>> df2 - rkey value -0 foo 5 -1 bar 6 -2 baz 7 -3 foo 8 - -Merge df1 and df2 on the lkey and rkey columns. The value columns have -the default suffixes, _x and _y, appended. - ->>> df1.merge(df2, left_on='lkey', right_on='rkey') - lkey value_x rkey value_y -0 foo 1 foo 5 -1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 - -Merge DataFrames df1 and df2 with specified left and right suffixes -appended to any overlapping columns. - ->>> df1.merge(df2, left_on='lkey', right_on='rkey', -... suffixes=('_left', '_right')) - lkey value_left rkey value_right -0 foo 1 foo 5 -1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 - -Merge DataFrames df1 and df2, but raise an exception if the DataFrames have -any overlapping columns. - ->>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)) -Traceback (most recent call last): -... -ValueError: columns overlap but no suffix specified: - Index(['value'], dtype='object') - ->>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) ->>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) ->>> df1 - a b -0 foo 1 -1 bar 2 ->>> df2 - a c -0 foo 3 -1 baz 4 - ->>> df1.merge(df2, how='inner', on='a') - a b c -0 foo 1 3 - ->>> df1.merge(df2, how='left', on='a') - a b c -0 foo 1 3.0 -1 bar 2 NaN - ->>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) ->>> df2 = pd.DataFrame({'right': [7, 8]}) ->>> df1 - left -0 foo -1 bar ->>> df2 - right -0 7 -1 8 - ->>> df1.merge(df2, how='cross') - left right -0 foo 7 -1 foo 8 -2 bar 7 -3 bar 8 -""" - - -# ----------------------------------------------------------------------- -# DataFrame class - - -class DataFrame(NDFrame, OpsMixin): - """ - Two-dimensional, size-mutable, potentially heterogeneous tabular data. - - Data structure also contains labeled axes (rows and columns). - Arithmetic operations align on both row and column labels. Can be - thought of as a dict-like container for Series objects. The primary - pandas data structure. - - Parameters - ---------- - data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame - Dict can contain Series, arrays, constants, dataclass or list-like objects. If - data is a dict, column order follows insertion-order. If a dict contains Series - which have an index defined, it is aligned by its index. - - .. versionchanged:: 0.25.0 - If data is a list of dicts, column order follows insertion-order. - - index : Index or array-like - Index to use for resulting frame. Will default to RangeIndex if - no indexing information part of input data and no index provided. - columns : Index or array-like - Column labels to use for resulting frame when data does not have them, - defaulting to RangeIndex(0, 1, 2, ..., n). 
If data contains column labels, - will perform column selection instead. - dtype : dtype, default None - Data type to force. Only a single dtype is allowed. If None, infer. - copy : bool or None, default None - Copy data from inputs. - For dict data, the default of None behaves like ``copy=True``. For DataFrame - or 2d ndarray input, the default of None behaves like ``copy=False``. - If data is a dict containing one or more Series (possibly of different dtypes), - ``copy=False`` will ensure that these inputs are not copied. - - .. versionchanged:: 1.3.0 - - See Also - -------- - DataFrame.from_records : Constructor from tuples, also record arrays. - DataFrame.from_dict : From dicts of Series, arrays, or dicts. - read_csv : Read a comma-separated values (csv) file into DataFrame. - read_table : Read general delimited file into DataFrame. - read_clipboard : Read text from clipboard into DataFrame. - - Notes - ----- - Please reference the :ref:`User Guide ` for more information. - - Examples - -------- - Constructing DataFrame from a dictionary. - - >>> d = {'col1': [1, 2], 'col2': [3, 4]} - >>> df = pd.DataFrame(data=d) - >>> df - col1 col2 - 0 1 3 - 1 2 4 - - Notice that the inferred dtype is int64. - - >>> df.dtypes - col1 int64 - col2 int64 - dtype: object - - To enforce a single dtype: - - >>> df = pd.DataFrame(data=d, dtype=np.int8) - >>> df.dtypes - col1 int8 - col2 int8 - dtype: object - - Constructing DataFrame from a dictionary including Series: - - >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])} - >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) - col1 col2 - 0 0 NaN - 1 1 NaN - 2 2 2.0 - 3 3 3.0 - - Constructing DataFrame from numpy ndarray: - - >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), - ... columns=['a', 'b', 'c']) - >>> df2 - a b c - 0 1 2 3 - 1 4 5 6 - 2 7 8 9 - - Constructing DataFrame from a numpy ndarray that has labeled columns: - - >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], - ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]) - >>> df3 = pd.DataFrame(data, columns=['c', 'a']) - ... 
- >>> df3 - c a - 0 3 1 - 1 6 4 - 2 9 7 - - Constructing DataFrame from dataclass: - - >>> from dataclasses import make_dataclass - >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) - >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) - x y - 0 0 0 - 1 0 3 - 2 2 3 - """ - - _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set - _typ = "dataframe" - _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) - _accessors: set[str] = {"sparse"} - _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([]) - _mgr: BlockManager | ArrayManager - - @property - def _constructor(self) -> Callable[..., DataFrame]: - return DataFrame - - _constructor_sliced: Callable[..., Series] = Series - - # ---------------------------------------------------------------------- - # Constructors - - def __init__( - self, - data=None, - index: Axes | None = None, - columns: Axes | None = None, - dtype: Dtype | None = None, - copy: bool | None = None, - ) -> None: - - if data is None: - data = {} - if dtype is not None: - dtype = self._validate_dtype(dtype) - - if isinstance(data, DataFrame): - data = data._mgr - - if isinstance(data, (BlockManager, ArrayManager)): - # first check if a Manager is passed without any other arguments - # -> use fastpath (without checking Manager type) - if index is None and columns is None and dtype is None and not copy: - # GH#33357 fastpath - NDFrame.__init__(self, data) - return - - manager = get_option("mode.data_manager") - - if copy is None: - if isinstance(data, dict): - # retain pre-GH#38939 default behavior - copy = True - elif ( - manager == "array" - and isinstance(data, (np.ndarray, ExtensionArray)) - and data.ndim == 2 - ): - # INFO(ArrayManager) by default copy the 2D input array to get - # contiguous 1D arrays - copy = True - else: - copy = False - - if isinstance(data, (BlockManager, ArrayManager)): - mgr = self._init_mgr( - data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy - ) - - elif isinstance(data, dict): - # GH#38939 de facto copy defaults to False only in non-dict cases - mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) - elif isinstance(data, ma.MaskedArray): - import numpy.ma.mrecords as mrecords - - # masked recarray - if isinstance(data, mrecords.MaskedRecords): - mgr = rec_array_to_mgr( - data, - index, - columns, - dtype, - copy, - typ=manager, - ) - warnings.warn( - "Support for MaskedRecords is deprecated and will be " - "removed in a future version. Pass " - "{name: data[name] for name in data.dtype.names} instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # a masked array - else: - data = sanitize_masked_array(data) - mgr = ndarray_to_mgr( - data, - index, - columns, - dtype=dtype, - copy=copy, - typ=manager, - ) - - elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)): - if data.dtype.names: - # i.e. numpy structured array - data = cast(np.ndarray, data) - mgr = rec_array_to_mgr( - data, - index, - columns, - dtype, - copy, - typ=manager, - ) - elif getattr(data, "name", None) is not None: - # i.e. 
Series/Index with non-None name - mgr = dict_to_mgr( - # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no - # attribute "name" - {data.name: data}, # type: ignore[union-attr] - index, - columns, - dtype=dtype, - typ=manager, - ) - else: - mgr = ndarray_to_mgr( - data, - index, - columns, - dtype=dtype, - copy=copy, - typ=manager, - ) - - # For data is list-like, or Iterable (will consume into list) - elif is_list_like(data): - if not isinstance(data, (abc.Sequence, ExtensionArray)): - if hasattr(data, "__array__"): - # GH#44616 big perf improvement for e.g. pytorch tensor - data = np.asarray(data) - else: - data = list(data) - if len(data) > 0: - if is_dataclass(data[0]): - data = dataclasses_to_dicts(data) - if not isinstance(data, np.ndarray) and treat_as_nested(data): - # exclude ndarray as we may have cast it a few lines above - if columns is not None: - # error: Argument 1 to "ensure_index" has incompatible type - # "Collection[Any]"; expected "Union[Union[Union[ExtensionArray, - # ndarray], Index, Series], Sequence[Any]]" - columns = ensure_index(columns) # type: ignore[arg-type] - arrays, columns, index = nested_data_to_arrays( - # error: Argument 3 to "nested_data_to_arrays" has incompatible - # type "Optional[Collection[Any]]"; expected "Optional[Index]" - data, - columns, - index, # type: ignore[arg-type] - dtype, - ) - mgr = arrays_to_mgr( - arrays, - columns, - index, - dtype=dtype, - typ=manager, - ) - else: - mgr = ndarray_to_mgr( - data, - index, - columns, - dtype=dtype, - copy=copy, - typ=manager, - ) - else: - mgr = dict_to_mgr( - {}, - index, - columns, - dtype=dtype, - typ=manager, - ) - # For data is scalar - else: - if index is None or columns is None: - raise ValueError("DataFrame constructor not properly called!") - - # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; - # expected "Union[Union[Union[ExtensionArray, ndarray], - # Index, Series], Sequence[Any]]" - index = ensure_index(index) # type: ignore[arg-type] - # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; - # expected "Union[Union[Union[ExtensionArray, ndarray], - # Index, Series], Sequence[Any]]" - columns = ensure_index(columns) # type: ignore[arg-type] - - if not dtype: - dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) - - # For data is a scalar extension dtype - if isinstance(dtype, ExtensionDtype): - # TODO(EA2D): special case not needed with 2D EAs - - values = [ - construct_1d_arraylike_from_scalar(data, len(index), dtype) - for _ in range(len(columns)) - ] - mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) - else: - arr2d = construct_2d_arraylike_from_scalar( - data, - len(index), - len(columns), - dtype, - copy, - ) - - mgr = ndarray_to_mgr( - arr2d, - index, - columns, - dtype=arr2d.dtype, - copy=False, - typ=manager, - ) - - # ensure correct Manager type according to settings - mgr = mgr_to_mgr(mgr, typ=manager) - - NDFrame.__init__(self, mgr) - - # ---------------------------------------------------------------------- - @validate_bool_kwargs_from_keywords('nan_as_null', 'allow_copy') - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ) -> DataFrameXchg: - """ - Return the dataframe exchange object implementing the exchange protocol. - - Parameters - ---------- - nan_as_null : bool, default False - Whether to tell the DataFrame to overwrite null values in the data - with ``NaN`` (or ``NaT``). - allow_copy : bool, default True - Whether to allow memory copying when exporting. 
If set to False - it would cause non-zero-copy exports to fail. - - Returns - ------- - DataFrame exchange object - The object which consuming library can use to ingress the dataframe. - - Notes - ----- - Details on the exchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - - `nan_as_null` currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - """ - - from pandas.core.exchange.dataframe import PandasDataFrameXchg - - return PandasDataFrameXchg(self, nan_as_null, allow_copy) - - # ---------------------------------------------------------------------- - - @property - def axes(self) -> list[Index]: - """ - Return a list representing the axes of the DataFrame. - - It has the row axis labels and column axis labels as the only members. - They are returned in that order. - - Examples - -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) - >>> df.axes - [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], - dtype='object')] - """ - return [self.index, self.columns] - - @property - def shape(self) -> tuple[int, int]: - """ - Return a tuple representing the dimensionality of the DataFrame. - - See Also - -------- - ndarray.shape : Tuple of array dimensions. - - Examples - -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) - >>> df.shape - (2, 2) - - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4], - ... 'col3': [5, 6]}) - >>> df.shape - (2, 3) - """ - return len(self.index), len(self.columns) - - @property - def _is_homogeneous_type(self) -> bool: - """ - Whether all the columns in a DataFrame have the same type. - - Returns - ------- - bool - - See Also - -------- - Index._is_homogeneous_type : Whether the object has a single - dtype. - MultiIndex._is_homogeneous_type : Whether all the levels of a - MultiIndex have the same dtype. - - Examples - -------- - >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type - True - >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type - False - - Items with the same type but different sizes are considered - different types. - - >>> DataFrame({ - ... "A": np.array([1, 2], dtype=np.int32), - ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type - False - """ - if isinstance(self._mgr, ArrayManager): - return len({arr.dtype for arr in self._mgr.arrays}) == 1 - if self._mgr.any_extension_types: - return len({block.dtype for block in self._mgr.blocks}) == 1 - else: - return not self._is_mixed_type - - @property - def _can_fast_transpose(self) -> bool: - """ - Can we transpose this DataFrame without creating any new array objects. - """ - if isinstance(self._mgr, ArrayManager): - return False - blocks = self._mgr.blocks - if len(blocks) != 1: - return False - - dtype = blocks[0].dtype - # TODO(EA2D) special case would be unnecessary with 2D EAs - return not is_1d_only_ea_dtype(dtype) - - # error: Return type "Union[ndarray, DatetimeArray, TimedeltaArray]" of - # "_values" incompatible with return type "ndarray" in supertype "NDFrame" - @property - def _values( # type: ignore[override] - self, - ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: - """ - Analogue to ._values that may return a 2D ExtensionArray. 
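Note: because __dataframe__ earlier in this hunk is decorated with
validate_bool_kwargs_from_keywords('nan_as_null', 'allow_copy'), non-bool flags
should now be rejected before the exchange object is built. An illustrative call
(the exact error text depends on the validator, which this patch does not show):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    xchg = df.__dataframe__(allow_copy=True)  # exchange-protocol object

    try:
        df.__dataframe__(allow_copy="yes")    # decorated version should raise
    except ValueError as err:
        print(err)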
-        """
-        self._consolidate_inplace()
-
-        mgr = self._mgr
-
-        if isinstance(mgr, ArrayManager):
-            if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
-                # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
-                # has no attribute "reshape"
-                return mgr.arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
-            return self.values
-
-        blocks = mgr.blocks
-        if len(blocks) != 1:
-            return self.values
-
-        arr = blocks[0].values
-        if arr.ndim == 1:
-            # non-2D ExtensionArray
-            return self.values
-
-        # more generally, whatever we allow in NDArrayBackedExtensionBlock
-        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
-        return arr.T
-
-    # ----------------------------------------------------------------------
-    # Rendering Methods
-
-    def _repr_fits_vertical_(self) -> bool:
-        """
-        Check length against max_rows.
-        """
-        max_rows = get_option("display.max_rows")
-        return len(self) <= max_rows
-
-    @validate_bool_kwargs_from_keywords('ignore_width')
-    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
-        """
-        Check if full repr fits in horizontal boundaries imposed by the display
-        options width and max_columns.
-
-        In case of non-interactive session, no boundaries apply.
-
-        `ignore_width` is here so ipynb+HTML output can behave the way
-        users expect. display.max_columns remains in effect.
-        GH3541, GH3573
-        """
-        width, height = console.get_console_size()
-        max_columns = get_option("display.max_columns")
-        nb_columns = len(self.columns)
-
-        # exceed max columns
-        if (max_columns and nb_columns > max_columns) or (
-            (not ignore_width) and width and nb_columns > (width // 2)
-        ):
-            return False
-
-        # used by repr_html under IPython notebook or scripts ignore terminal
-        # dims
-        if ignore_width or not console.in_interactive_session():
-            return True
-
-        if get_option("display.width") is not None or console.in_ipython_frontend():
-            # check at least the column row for excessive width
-            max_rows = 1
-        else:
-            max_rows = get_option("display.max_rows")
-
-        # when auto-detecting, so width=None and not in ipython front end
-        # check whether repr fits horizontal by actually checking
-        # the width of the rendered repr
-        buf = StringIO()
-
-        # only care about the stuff we'll actually print out
-        # and to_string on entire frame may be expensive
-        d = self
-
-        if max_rows is not None:  # unlimited rows
-            # min of two, where one may be None
-            d = d.iloc[: min(max_rows, len(d))]
-        else:
-            return True
-
-        d.to_string(buf=buf)
-        value = buf.getvalue()
-        repr_width = max(len(line) for line in value.split("\n"))
-
-        return repr_width < width
-
-    def _info_repr(self) -> bool:
-        """
-        True if the repr should show the info view.
-        """
-        info_repr_option = get_option("display.large_repr") == "info"
-        return info_repr_option and not (
-            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
-        )
-
-    def __repr__(self) -> str:
-        """
-        Return a string representation for a particular DataFrame.
-        """
-        if self._info_repr():
-            buf = StringIO()
-            self.info(buf=buf)
-            return buf.getvalue()
-
-        repr_params = fmt.get_dataframe_repr_params()
-        return self.to_string(**repr_params)
-
-    def _repr_html_(self) -> str | None:
-        """
-        Return a html representation for a particular DataFrame.
-
-        Mainly for IPython notebook.
-        """
-        if self._info_repr():
-            buf = StringIO()
-            self.info(buf=buf)
-            # need to escape the <class>, should be the first line.
-            val = buf.getvalue().replace("<", r"&lt;", 1)
-            val = val.replace(">", r"&gt;", 1)
-            return "<pre>" + val + "</pre>"
-
-        if get_option("display.notebook_repr_html"):
-            max_rows = get_option("display.max_rows")
-            min_rows = get_option("display.min_rows")
-            max_cols = get_option("display.max_columns")
-            show_dimensions = get_option("display.show_dimensions")
-
-            formatter = fmt.DataFrameFormatter(
-                self,
-                columns=None,
-                col_space=None,
-                na_rep="NaN",
-                formatters=None,
-                float_format=None,
-                sparsify=None,
-                justify=None,
-                index_names=True,
-                header=True,
-                index=True,
-                bold_rows=True,
-                escape=True,
-                max_rows=max_rows,
-                min_rows=min_rows,
-                max_cols=max_cols,
-                show_dimensions=show_dimensions,
-                decimal=".",
-            )
-            return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
-        else:
-            return None
-
-    @overload
-    def to_string(
-        self,
-        buf: None = ...,
-        columns: Sequence[str] | None = ...,
-        col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | Sequence[str] = ...,
-        index: bool = ...,
-        na_rep: str = ...,
-        formatters: fmt.FormattersType | None = ...,
-        float_format: fmt.FloatFormatType | None = ...,
-        sparsify: bool | None = ...,
-        index_names: bool = ...,
-        justify: str | None = ...,
-        max_rows: int | None = ...,
-        max_cols: int | None = ...,
-        show_dimensions: bool = ...,
-        decimal: str = ...,
-        line_width: int | None = ...,
-        min_rows: int | None = ...,
-        max_colwidth: int | None = ...,
-        encoding: str | None = ...,
-    ) -> str:
-        ...
-
-    @overload
-    def to_string(
-        self,
-        buf: FilePath | WriteBuffer[str],
-        columns: Sequence[str] | None = ...,
-        col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | Sequence[str] = ...,
-        index: bool = ...,
-        na_rep: str = ...,
-        formatters: fmt.FormattersType | None = ...,
-        float_format: fmt.FloatFormatType | None = ...,
-        sparsify: bool | None = ...,
-        index_names: bool = ...,
-        justify: str | None = ...,
-        max_rows: int | None = ...,
-        max_cols: int | None = ...,
-        show_dimensions: bool = ...,
-        decimal: str = ...,
-        line_width: int | None = ...,
-        min_rows: int | None = ...,
-        max_colwidth: int | None = ...,
-        encoding: str | None = ...,
-    ) -> None:
-        ...
-
-    @Substitution(
-        header_type="bool or sequence of str",
-        header="Write out the column names. If a list of strings "
-        "is given, it is assumed to be aliases for the "
-        "column names",
-        col_space_type="int, list or dict of int",
-        col_space="The minimum width of each column. If a list of ints is given "
-        "every integers corresponds with one column. If a dict is given, the key "
-        "references the column, while the value defines the space to use.",
-    )
-    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
-    @validate_bool_kwargs_from_keywords('index', 'index_names', 'show_dimensions')
-    def to_string(
-        self,
-        buf: FilePath | WriteBuffer[str] | None = None,
-        columns: Sequence[str] | None = None,
-        col_space: int | list[int] | dict[Hashable, int] | None = None,
-        header: bool | Sequence[str] = True,
-        index: bool = True,
-        na_rep: str = "NaN",
-        formatters: fmt.FormattersType | None = None,
-        float_format: fmt.FloatFormatType | None = None,
-        sparsify: bool | None = None,
-        index_names: bool = True,
-        justify: str | None = None,
-        max_rows: int | None = None,
-        max_cols: int | None = None,
-        show_dimensions: bool = False,
-        decimal: str = ".",
-        line_width: int | None = None,
-        min_rows: int | None = None,
-        max_colwidth: int | None = None,
-        encoding: str | None = None,
-    ) -> str | None:
-        """
-        Render a DataFrame to a console-friendly tabular output.
- %(shared_params)s - line_width : int, optional - Width to wrap a line in characters. - min_rows : int, optional - The number of rows to display in the console in a truncated repr - (when number of rows is above `max_rows`). - max_colwidth : int, optional - Max width to truncate each column in characters. By default, no limit. - - .. versionadded:: 1.0.0 - encoding : str, default "utf-8" - Set character encoding. - - .. versionadded:: 1.0 - %(returns)s - See Also - -------- - to_html : Convert DataFrame to HTML. - - Examples - -------- - >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} - >>> df = pd.DataFrame(d) - >>> print(df.to_string()) - col1 col2 - 0 1 4 - 1 2 5 - 2 3 6 - """ - from pandas import option_context - - with option_context("display.max_colwidth", max_colwidth): - formatter = fmt.DataFrameFormatter( - self, - columns=columns, - col_space=col_space, - na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, - justify=justify, - index_names=index_names, - header=header, - index=index, - min_rows=min_rows, - max_rows=max_rows, - max_cols=max_cols, - show_dimensions=show_dimensions, - decimal=decimal, - ) - return fmt.DataFrameRenderer(formatter).to_string( - buf=buf, - encoding=encoding, - line_width=line_width, - ) - - # ---------------------------------------------------------------------- - - @property - def style(self) -> Styler: - """ - Returns a Styler object. - - Contains methods for building a styled HTML representation of the DataFrame. - - See Also - -------- - io.formats.style.Styler : Helps style a DataFrame or Series according to the - data with HTML and CSS. - """ - from pandas.io.formats.style import Styler - - return Styler(self) - - _shared_docs[ - "items" - ] = r""" - Iterate over (column name, Series) pairs. - - Iterates over the DataFrame columns, returning a tuple with - the column name and the content as a Series. - - Yields - ------ - label : object - The column names for the DataFrame being iterated over. - content : Series - The column entries belonging to each label, as a Series. - - See Also - -------- - DataFrame.iterrows : Iterate over DataFrame rows as - (index, Series) pairs. - DataFrame.itertuples : Iterate over DataFrame rows as namedtuples - of the values. - - Examples - -------- - >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'], - ... 'population': [1864, 22000, 80000]}, - ... index=['panda', 'polar', 'koala']) - >>> df - species population - panda bear 1864 - polar bear 22000 - koala marsupial 80000 - >>> for label, content in df.items(): - ... print(f'label: {label}') - ... print(f'content: {content}', sep='\n') - ... - label: species - content: - panda bear - polar bear - koala marsupial - Name: species, dtype: object - label: population - content: - panda 1864 - polar 22000 - koala 80000 - Name: population, dtype: int64 - """ - - @Appender(_shared_docs["items"]) - def items(self) -> Iterable[tuple[Hashable, Series]]: - if self.columns.is_unique and hasattr(self, "_item_cache"): - for k in self.columns: - yield k, self._get_item_cache(k) - else: - for i, k in enumerate(self.columns): - yield k, self._ixs(i, axis=1) - - @Appender(_shared_docs["items"]) - def iteritems(self) -> Iterable[tuple[Hashable, Series]]: - warnings.warn( - "iteritems is deprecated and will be removed in a future version. 
" - "Use .items instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - yield from self.items() - - def iterrows(self) -> Iterable[tuple[Hashable, Series]]: - """ - Iterate over DataFrame rows as (index, Series) pairs. - - Yields - ------ - index : label or tuple of label - The index of the row. A tuple for a `MultiIndex`. - data : Series - The data of the row as a Series. - - See Also - -------- - DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. - DataFrame.items : Iterate over (column name, Series) pairs. - - Notes - ----- - 1. Because ``iterrows`` returns a Series for each row, - it does **not** preserve dtypes across the rows (dtypes are - preserved across columns for DataFrames). For example, - - >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) - >>> row = next(df.iterrows())[1] - >>> row - int 1.0 - float 1.5 - Name: 0, dtype: float64 - >>> print(row['int'].dtype) - float64 - >>> print(df['int'].dtype) - int64 - - To preserve dtypes while iterating over the rows, it is better - to use :meth:`itertuples` which returns namedtuples of the values - and which is generally faster than ``iterrows``. - - 2. You should **never modify** something you are iterating over. - This is not guaranteed to work in all cases. Depending on the - data types, the iterator returns a copy and not a view, and writing - to it will have no effect. - """ - columns = self.columns - klass = self._constructor_sliced - for k, v in zip(self.index, self.values): - s = klass(v, index=columns, name=k).__finalize__(self) - yield k, s - @validate_bool_kwargs_from_keywords('index') - def itertuples( - self, index: bool = True, name: str | None = "Pandas" - ) -> Iterable[tuple[Any, ...]]: - """ - Iterate over DataFrame rows as namedtuples. - - Parameters - ---------- - index : bool, default True - If True, return the index as the first element of the tuple. - name : str or None, default "Pandas" - The name of the returned namedtuples or None to return regular - tuples. - - Returns - ------- - iterator - An object to iterate over namedtuples for each row in the - DataFrame with the first field possibly being the index and - following fields being the column values. - - See Also - -------- - DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) - pairs. - DataFrame.items : Iterate over (column name, Series) pairs. - - Notes - ----- - The column names will be renamed to positional names if they are - invalid Python identifiers, repeated, or start with an underscore. - - Examples - -------- - >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, - ... index=['dog', 'hawk']) - >>> df - num_legs num_wings - dog 4 0 - hawk 2 2 - >>> for row in df.itertuples(): - ... print(row) - ... - Pandas(Index='dog', num_legs=4, num_wings=0) - Pandas(Index='hawk', num_legs=2, num_wings=2) - - By setting the `index` parameter to False we can remove the index - as the first element of the tuple: - - >>> for row in df.itertuples(index=False): - ... print(row) - ... - Pandas(num_legs=4, num_wings=0) - Pandas(num_legs=2, num_wings=2) - - With the `name` parameter set we set a custom name for the yielded - namedtuples: - - >>> for row in df.itertuples(name='Animal'): - ... print(row) - ... 
- Animal(Index='dog', num_legs=4, num_wings=0) - Animal(Index='hawk', num_legs=2, num_wings=2) - """ - arrays = [] - fields = list(self.columns) - if index: - arrays.append(self.index) - fields.insert(0, "Index") - - # use integer indexing because of possible duplicate column names - arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - - if name is not None: - # https://github.com/python/mypy/issues/9046 - # error: namedtuple() expects a string literal as the first argument - itertuple = collections.namedtuple( # type: ignore[misc] - name, fields, rename=True - ) - return map(itertuple._make, zip(*arrays)) - - # fallback to regular tuples - return zip(*arrays) - - def __len__(self) -> int: - """ - Returns length of info axis, but here we use the index. - """ - return len(self.index) - - @overload - def dot(self, other: Series) -> Series: - ... - - @overload - def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: - ... - - def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: - """ - Compute the matrix multiplication between the DataFrame and other. - - This method computes the matrix product between the DataFrame and the - values of an other Series, DataFrame or a numpy array. - - It can also be called using ``self @ other`` in Python >= 3.5. - - Parameters - ---------- - other : Series, DataFrame or array-like - The other object to compute the matrix product with. - - Returns - ------- - Series or DataFrame - If other is a Series, return the matrix product between self and - other as a Series. If other is a DataFrame or a numpy.array, return - the matrix product of self and other in a DataFrame of a np.array. - - See Also - -------- - Series.dot: Similar method for Series. - - Notes - ----- - The dimensions of DataFrame and other must be compatible in order to - compute the matrix multiplication. In addition, the column names of - DataFrame and the index of other must contain the same values, as they - will be aligned prior to the multiplication. - - The dot method for Series computes the inner product, instead of the - matrix product here. - - Examples - -------- - Here we multiply a DataFrame with a Series. - - >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) - >>> s = pd.Series([1, 1, 2, 1]) - >>> df.dot(s) - 0 -4 - 1 5 - dtype: int64 - - Here we multiply a DataFrame with another DataFrame. - - >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) - >>> df.dot(other) - 0 1 - 0 1 4 - 1 2 2 - - Note that the dot method give the same result as @ - - >>> df @ other - 0 1 - 0 1 4 - 1 2 2 - - The dot method works also if other is an np.array. - - >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]]) - >>> df.dot(arr) - 0 1 - 0 1 4 - 1 2 2 - - Note how shuffling of the objects does not change the result. 
- - >>> s2 = s.reindex([1, 0, 2, 3]) - >>> df.dot(s2) - 0 -4 - 1 5 - dtype: int64 - """ - if isinstance(other, (Series, DataFrame)): - common = self.columns.union(other.index) - if len(common) > len(self.columns) or len(common) > len(other.index): - raise ValueError("matrices are not aligned") - - left = self.reindex(columns=common, copy=False) - right = other.reindex(index=common, copy=False) - lvals = left.values - rvals = right._values - else: - left = self - lvals = self.values - rvals = np.asarray(other) - if lvals.shape[1] != rvals.shape[0]: - raise ValueError( - f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" - ) - - if isinstance(other, DataFrame): - return self._constructor( - np.dot(lvals, rvals), index=left.index, columns=other.columns - ) - elif isinstance(other, Series): - return self._constructor_sliced(np.dot(lvals, rvals), index=left.index) - elif isinstance(rvals, (np.ndarray, Index)): - result = np.dot(lvals, rvals) - if result.ndim == 2: - return self._constructor(result, index=left.index) - else: - return self._constructor_sliced(result, index=left.index) - else: # pragma: no cover - raise TypeError(f"unsupported type: {type(other)}") - - @overload - def __matmul__(self, other: Series) -> Series: - ... - - @overload - def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: - ... - - def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: - """ - Matrix multiplication using binary `@` operator in Python>=3.5. - """ - return self.dot(other) - - def __rmatmul__(self, other): - """ - Matrix multiplication using binary `@` operator in Python>=3.5. - """ - try: - return self.T.dot(np.transpose(other)).T - except ValueError as err: - if "shape mismatch" not in str(err): - raise - # GH#21581 give exception message for original shapes - msg = f"shapes {np.shape(other)} and {self.shape} not aligned" - raise ValueError(msg) from err - - # ---------------------------------------------------------------------- - # IO methods (to / from other formats) - - @classmethod - def from_dict( - cls, - data, - orient: str = "columns", - dtype: Dtype | None = None, - columns=None, - ) -> DataFrame: - """ - Construct DataFrame from dict of array-like or dicts. - - Creates DataFrame object from dictionary by columns or by index - allowing dtype specification. - - Parameters - ---------- - data : dict - Of the form {field : array-like} or {field : dict}. - orient : {'columns', 'index', 'tight'}, default 'columns' - The "orientation" of the data. If the keys of the passed dict - should be the columns of the resulting DataFrame, pass 'columns' - (default). Otherwise if the keys should be rows, pass 'index'. - If 'tight', assume a dict with keys ['index', 'columns', 'data', - 'index_names', 'column_names']. - - .. versionadded:: 1.4.0 - 'tight' as an allowed value for the ``orient`` argument - - dtype : dtype, default None - Data type to force, otherwise infer. - columns : list, default None - Column labels to use when ``orient='index'``. Raises a ValueError - if used with ``orient='columns'`` or ``orient='tight'``. - - Returns - ------- - DataFrame - - See Also - -------- - DataFrame.from_records : DataFrame from structured ndarray, sequence - of tuples or dicts, or DataFrame. - DataFrame : DataFrame object creation using constructor. - DataFrame.to_dict : Convert the DataFrame to a dictionary. 
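A note on the new `validate_bool_kwargs_from_keywords` decorator applied throughout this patch (first seen on `itertuples` above): the helper itself lives in `pandas.util._validators` and is not shown in this hunk, so its actual body is unknown here. The sketch below is one plausible shape for it, assuming it simply routes each named keyword through the existing `validate_bool_kwarg`; it is illustrative, not the patch's implementation.

import functools

from pandas.util._validators import validate_bool_kwarg


def validate_bool_kwargs_from_keywords(*names):
    # Hypothetical sketch: pass each listed keyword through the existing
    # validate_bool_kwarg check before invoking the wrapped method.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for name in names:
                if name in kwargs:
                    kwargs[name] = validate_bool_kwarg(kwargs[name], name)
            return func(*args, **kwargs)

        return wrapper

    return decorator

One limitation of this naive version: arguments passed positionally (e.g. `df.itertuples(False)`) never reach `kwargs`, so a real implementation would presumably bind the call against the function's signature first (for instance via `inspect.signature(func).bind(*args, **kwargs)`) before validating.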
-
-        Examples
-        --------
-        By default the keys of the dict become the DataFrame columns:
-
-        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
-        >>> pd.DataFrame.from_dict(data)
-           col_1 col_2
-        0      3     a
-        1      2     b
-        2      1     c
-        3      0     d
-
-        Specify ``orient='index'`` to create the DataFrame using dictionary
-        keys as rows:
-
-        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
-        >>> pd.DataFrame.from_dict(data, orient='index')
-               0  1  2  3
-        row_1  3  2  1  0
-        row_2  a  b  c  d
-
-        When using the 'index' orientation, the column names can be
-        specified manually:
-
-        >>> pd.DataFrame.from_dict(data, orient='index',
-        ...                        columns=['A', 'B', 'C', 'D'])
-               A  B  C  D
-        row_1  3  2  1  0
-        row_2  a  b  c  d
-
-        Specify ``orient='tight'`` to create the DataFrame using a 'tight'
-        format:
-
-        >>> data = {'index': [('a', 'b'), ('a', 'c')],
-        ...         'columns': [('x', 1), ('y', 2)],
-        ...         'data': [[1, 3], [2, 4]],
-        ...         'index_names': ['n1', 'n2'],
-        ...         'column_names': ['z1', 'z2']}
-        >>> pd.DataFrame.from_dict(data, orient='tight')
-        z1     x  y
-        z2     1  2
-        n1 n2
-        a  b   1  3
-           c   2  4
-        """
-        index = None
-        orient = orient.lower()
-        if orient == "index":
-            if len(data) > 0:
-                # TODO speed up Series case
-                if isinstance(list(data.values())[0], (Series, dict)):
-                    data = _from_nested_dict(data)
-                else:
-                    data, index = list(data.values()), list(data.keys())
-        elif orient == "columns" or orient == "tight":
-            if columns is not None:
-                raise ValueError(f"cannot use columns parameter with orient='{orient}'")
-        else:  # pragma: no cover
-            raise ValueError("only recognize index, columns or tight for orient")
-
-        if orient != "tight":
-            return cls(data, index=index, columns=columns, dtype=dtype)
-        else:
-            realdata = data["data"]
-
-            def create_index(indexlist, namelist):
-                index: Index
-                if len(namelist) > 1:
-                    index = MultiIndex.from_tuples(indexlist, names=namelist)
-                else:
-                    index = Index(indexlist, name=namelist[0])
-                return index
-
-            index = create_index(data["index"], data["index_names"])
-            columns = create_index(data["columns"], data["column_names"])
-            return cls(realdata, index=index, columns=columns, dtype=dtype)
-
-    @validate_bool_kwargs_from_keywords('copy')
-    def to_numpy(
-        self,
-        dtype: npt.DTypeLike | None = None,
-        copy: bool = False,
-        na_value=lib.no_default,
-    ) -> np.ndarray:
-        """
-        Convert the DataFrame to a NumPy array.
-
-        By default, the dtype of the returned array will be the common NumPy
-        dtype of all types in the DataFrame. For example, if the dtypes are
-        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
-        This may require copying data and coercing values, which may be
-        expensive.
-
-        Parameters
-        ----------
-        dtype : str or numpy.dtype, optional
-            The dtype to pass to :meth:`numpy.asarray`.
-        copy : bool, default False
-            Whether to ensure that the returned value is not a view on
-            another array. Note that ``copy=False`` does not *ensure* that
-            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
-            a copy is made, even if not strictly necessary.
-        na_value : Any, optional
-            The value to use for missing values. The default value depends
-            on `dtype` and the dtypes of the DataFrame columns.
-
-            .. versionadded:: 1.1.0
-
-        Returns
-        -------
-        numpy.ndarray
-
-        See Also
-        --------
-        Series.to_numpy : Similar method for Series.
-
-        Examples
-        --------
-        >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
-        array([[1, 3],
-               [2, 4]])
-
-        With heterogeneous data, the lowest common type will have to
-        be used.
- - >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) - >>> df.to_numpy() - array([[1. , 3. ], - [2. , 4.5]]) - - For a mix of numeric and non-numeric types, the output array will - have object dtype. - - >>> df['C'] = pd.date_range('2000', periods=2) - >>> df.to_numpy() - array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], - [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) - """ - self._consolidate_inplace() - if dtype is not None: - dtype = np.dtype(dtype) - result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) - if result.dtype is not dtype: - result = np.array(result, dtype=dtype, copy=False) - - return result - - def to_dict(self, orient: str = "dict", into=dict): - """ - Convert the DataFrame to a dictionary. - - The type of the key-value pairs can be customized with the parameters - (see below). - - Parameters - ---------- - orient : str {'dict', 'list', 'series', 'split', 'records', 'index'} - Determines the type of the values of the dictionary. - - - 'dict' (default) : dict like {column -> {index -> value}} - - 'list' : dict like {column -> [values]} - - 'series' : dict like {column -> Series(values)} - - 'split' : dict like - {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} - - 'tight' : dict like - {'index' -> [index], 'columns' -> [columns], 'data' -> [values], - 'index_names' -> [index.names], 'column_names' -> [column.names]} - - 'records' : list like - [{column -> value}, ... , {column -> value}] - - 'index' : dict like {index -> {column -> value}} - - Abbreviations are allowed. `s` indicates `series` and `sp` - indicates `split`. - - .. versionadded:: 1.4.0 - 'tight' as an allowed value for the ``orient`` argument - - into : class, default dict - The collections.abc.Mapping subclass used for all Mappings - in the return value. Can be the actual class or an empty - instance of the mapping type you want. If you want a - collections.defaultdict, you must pass it initialized. - - Returns - ------- - dict, list or collections.abc.Mapping - Return a collections.abc.Mapping object representing the DataFrame. - The resulting transformation depends on the `orient` parameter. - - See Also - -------- - DataFrame.from_dict: Create a DataFrame from a dictionary. - DataFrame.to_json: Convert a DataFrame to JSON format. - - Examples - -------- - >>> df = pd.DataFrame({'col1': [1, 2], - ... 'col2': [0.5, 0.75]}, - ... index=['row1', 'row2']) - >>> df - col1 col2 - row1 1 0.50 - row2 2 0.75 - >>> df.to_dict() - {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} - - You can specify the return orientation. - - >>> df.to_dict('series') - {'col1': row1 1 - row2 2 - Name: col1, dtype: int64, - 'col2': row1 0.50 - row2 0.75 - Name: col2, dtype: float64} - - >>> df.to_dict('split') - {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], - 'data': [[1, 0.5], [2, 0.75]]} - - >>> df.to_dict('records') - [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] - - >>> df.to_dict('index') - {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} - - >>> df.to_dict('tight') - {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], - 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} - - You can also specify the mapping type. 
-
-        >>> from collections import OrderedDict, defaultdict
-        >>> df.to_dict(into=OrderedDict)
-        OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
-                     ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
-
-        If you want a `defaultdict`, you need to initialize it:
-
-        >>> dd = defaultdict(list)
-        >>> df.to_dict('records', into=dd)
-        [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
-         defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
-        """
-        if not self.columns.is_unique:
-            warnings.warn(
-                "DataFrame columns are not unique, some columns will be omitted.",
-                UserWarning,
-                stacklevel=find_stack_level(),
-            )
-        # GH16122
-        into_c = com.standardize_mapping(into)
-
-        orient = orient.lower()
-        # GH32515
-        if orient.startswith(("d", "l", "s", "r", "i")) and orient not in {
-            "dict",
-            "list",
-            "series",
-            "split",
-            "records",
-            "index",
-        }:
-            warnings.warn(
-                "Using short name for 'orient' is deprecated. Only the "
-                "options: ('dict', 'list', 'series', 'split', 'records', 'index') "
-                "will be used in a future version. Use one of the above "
-                "to silence this warning.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-
-            if orient.startswith("d"):
-                orient = "dict"
-            elif orient.startswith("l"):
-                orient = "list"
-            elif orient.startswith("sp"):
-                orient = "split"
-            elif orient.startswith("s"):
-                orient = "series"
-            elif orient.startswith("r"):
-                orient = "records"
-            elif orient.startswith("i"):
-                orient = "index"
-
-        if orient == "dict":
-            return into_c((k, v.to_dict(into)) for k, v in self.items())
-
-        elif orient == "list":
-            return into_c(
-                (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items()
-            )
-
-        elif orient == "split":
-            return into_c(
-                (
-                    ("index", self.index.tolist()),
-                    ("columns", self.columns.tolist()),
-                    (
-                        "data",
-                        [
-                            list(map(maybe_box_native, t))
-                            for t in self.itertuples(index=False, name=None)
-                        ],
-                    ),
-                )
-            )
-
-        elif orient == "tight":
-            return into_c(
-                (
-                    ("index", self.index.tolist()),
-                    ("columns", self.columns.tolist()),
-                    (
-                        "data",
-                        [
-                            list(map(maybe_box_native, t))
-                            for t in self.itertuples(index=False, name=None)
-                        ],
-                    ),
-                    ("index_names", list(self.index.names)),
-                    ("column_names", list(self.columns.names)),
-                )
-            )
-
-        elif orient == "series":
-            return into_c((k, v) for k, v in self.items())
-
-        elif orient == "records":
-            columns = self.columns.tolist()
-            rows = (
-                dict(zip(columns, row))
-                for row in self.itertuples(index=False, name=None)
-            )
-            return [
-                into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
-            ]
-
-        elif orient == "index":
-            if not self.index.is_unique:
-                raise ValueError("DataFrame index must be unique for orient='index'.")
-            return into_c(
-                (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
-                for t in self.itertuples(name=None)
-            )
-
-        else:
-            raise ValueError(f"orient '{orient}' not understood")
-
-    @validate_bool_kwargs_from_keywords('reauth', 'auth_local_webserver', 'progress_bar')
-    def to_gbq(
-        self,
-        destination_table: str,
-        project_id: str | None = None,
-        chunksize: int | None = None,
-        reauth: bool = False,
-        if_exists: str = "fail",
-        auth_local_webserver: bool = True,
-        table_schema: list[dict[str, str]] | None = None,
-        location: str | None = None,
-        progress_bar: bool = True,
-        credentials=None,
-    ) -> None:
-        """
-        Write a DataFrame to a Google BigQuery table.
-
-        This function requires the `pandas-gbq package
-        <https://pandas-gbq.readthedocs.io>`__.
-
-        See the `How to authenticate with Google BigQuery
-        <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
-        guide for authentication instructions.
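Backing up to `to_dict` for a moment: the non-unique-columns warning in the implementation above is easy to trigger, and the last duplicate silently wins because the result is keyed by column label. A small demonstration of the behavior as the code stands:

import pandas as pd

df = pd.DataFrame([[1, 2]], columns=["x", "x"])
result = df.to_dict()  # UserWarning: DataFrame columns are not unique, ...
print(result)          # {'x': {0: 2}} -- only the last 'x' column survives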
- - Parameters - ---------- - destination_table : str - Name of table to be written, in the form ``dataset.tablename``. - project_id : str, optional - Google BigQuery Account project ID. Optional when available from - the environment. - chunksize : int, optional - Number of rows to be inserted in each chunk from the dataframe. - Set to ``None`` to load the whole dataframe at once. - reauth : bool, default False - Force Google BigQuery to re-authenticate the user. This is useful - if multiple accounts are used. - if_exists : str, default 'fail' - Behavior when the destination table exists. Value can be one of: - - ``'fail'`` - If table exists raise pandas_gbq.gbq.TableCreationError. - ``'replace'`` - If table exists, drop it, recreate it, and insert data. - ``'append'`` - If table exists, insert data. Create if does not exist. - auth_local_webserver : bool, default True - Use the `local webserver flow`_ instead of the `console flow`_ - when getting user credentials. - - .. _local webserver flow: - https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server - .. _console flow: - https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console - - *New in version 0.2.0 of pandas-gbq*. - - .. versionchanged:: 1.5.0 - Default value is changed to ``True``. Google has deprecated the - ``auth_local_webserver = False`` `"out of band" (copy-paste) - flow - `_. - table_schema : list of dicts, optional - List of BigQuery table fields to which according DataFrame - columns conform to, e.g. ``[{'name': 'col1', 'type': - 'STRING'},...]``. If schema is not provided, it will be - generated according to dtypes of DataFrame columns. See - BigQuery API documentation on available names of a field. - - *New in version 0.3.1 of pandas-gbq*. - location : str, optional - Location where the load job should run. See the `BigQuery locations - documentation - `__ for a - list of available locations. The location must match that of the - target dataset. - - *New in version 0.5.0 of pandas-gbq*. - progress_bar : bool, default True - Use the library `tqdm` to show the progress bar for the upload, - chunk by chunk. - - *New in version 0.5.0 of pandas-gbq*. - credentials : google.auth.credentials.Credentials, optional - Credentials for accessing Google APIs. Use this parameter to - override default credentials, such as to use Compute Engine - :class:`google.auth.compute_engine.Credentials` or Service - Account :class:`google.oauth2.service_account.Credentials` - directly. - - *New in version 0.8.0 of pandas-gbq*. - - See Also - -------- - pandas_gbq.to_gbq : This function in the pandas-gbq library. - read_gbq : Read a DataFrame from Google BigQuery. - """ - from pandas.io import gbq - - gbq.to_gbq( - self, - destination_table, - project_id=project_id, - chunksize=chunksize, - reauth=reauth, - if_exists=if_exists, - auth_local_webserver=auth_local_webserver, - table_schema=table_schema, - location=location, - progress_bar=progress_bar, - credentials=credentials, - ) - - @classmethod - def from_records( - cls, - data, - index=None, - exclude=None, - columns=None, - coerce_float: bool = False, - nrows: int | None = None, - ) -> DataFrame: - """ - Convert structured or record ndarray to DataFrame. - - Creates a DataFrame object from a structured ndarray, sequence of - tuples or dicts, or DataFrame. 
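The parameter list below notes that `data` may be an iterator, consumed up to `nrows`. A quick illustration of that path; note the iterator is only partially consumed:

import pandas as pd

rows = iter([(3, "a"), (2, "b"), (1, "c"), (0, "d")])
df = pd.DataFrame.from_records(rows, columns=["col_1", "col_2"], nrows=2)
print(df)          # two rows: (3, 'a') and (2, 'b')
print(next(rows))  # (1, 'c') -- remaining items are still in the iterator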
- - Parameters - ---------- - data : structured ndarray, sequence of tuples or dicts, or DataFrame - Structured input data. - index : str, list of fields, array-like - Field of array to use as the index, alternately a specific set of - input labels to use. - exclude : sequence, default None - Columns or fields to exclude. - columns : sequence, default None - Column names to use. If the passed data do not have names - associated with them, this argument provides names for the - columns. Otherwise this argument indicates the order of the columns - in the result (any names not found in the data will become all-NA - columns). - coerce_float : bool, default False - Attempt to convert values of non-string, non-numeric objects (like - decimal.Decimal) to floating point, useful for SQL result sets. - nrows : int, default None - Number of rows to read if data is an iterator. - - Returns - ------- - DataFrame - - See Also - -------- - DataFrame.from_dict : DataFrame from dict of array-like or dicts. - DataFrame : DataFrame object creation using constructor. - - Examples - -------- - Data can be provided as a structured ndarray: - - >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], - ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) - >>> pd.DataFrame.from_records(data) - col_1 col_2 - 0 3 a - 1 2 b - 2 1 c - 3 0 d - - Data can be provided as a list of dicts: - - >>> data = [{'col_1': 3, 'col_2': 'a'}, - ... {'col_1': 2, 'col_2': 'b'}, - ... {'col_1': 1, 'col_2': 'c'}, - ... {'col_1': 0, 'col_2': 'd'}] - >>> pd.DataFrame.from_records(data) - col_1 col_2 - 0 3 a - 1 2 b - 2 1 c - 3 0 d - - Data can be provided as a list of tuples with corresponding columns: - - >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] - >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) - col_1 col_2 - 0 3 a - 1 2 b - 2 1 c - 3 0 d - """ - result_index = None - - # Make a copy of the input columns so we can modify it - if columns is not None: - columns = ensure_index(columns) - - def maybe_reorder( - arrays: list[ArrayLike], arr_columns: Index, columns: Index, index - ) -> tuple[list[ArrayLike], Index, Index | None]: - """ - If our desired 'columns' do not match the data's pre-existing 'arr_columns', - we re-order our arrays. This is like a pre-emptive (cheap) reindex. 
- """ - if len(arrays): - length = len(arrays[0]) - else: - length = 0 - - result_index = None - if len(arrays) == 0 and index is None and length == 0: - # for backward compat use an object Index instead of RangeIndex - result_index = Index([]) - - arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) - return arrays, arr_columns, result_index - - if is_iterator(data): - if nrows == 0: - return cls() - - try: - first_row = next(data) - except StopIteration: - return cls(index=index, columns=columns) - - dtype = None - if hasattr(first_row, "dtype") and first_row.dtype.names: - dtype = first_row.dtype - - values = [first_row] - - if nrows is None: - values += data - else: - values.extend(itertools.islice(data, nrows - 1)) - - if dtype is not None: - data = np.array(values, dtype=dtype) - else: - data = values - - if isinstance(data, dict): - if columns is None: - columns = arr_columns = ensure_index(sorted(data)) - arrays = [data[k] for k in columns] - else: - arrays = [] - arr_columns_list = [] - for k, v in data.items(): - if k in columns: - arr_columns_list.append(k) - arrays.append(v) - - arr_columns = Index(arr_columns_list) - arrays, arr_columns, result_index = maybe_reorder( - arrays, arr_columns, columns, index - ) - - elif isinstance(data, (np.ndarray, DataFrame)): - arrays, columns = to_arrays(data, columns) - arr_columns = columns - else: - arrays, arr_columns = to_arrays(data, columns) - if coerce_float: - for i, arr in enumerate(arrays): - if arr.dtype == object: - # error: Argument 1 to "maybe_convert_objects" has - # incompatible type "Union[ExtensionArray, ndarray]"; - # expected "ndarray" - arrays[i] = lib.maybe_convert_objects( - arr, # type: ignore[arg-type] - try_float=True, - ) - - arr_columns = ensure_index(arr_columns) - if columns is None: - columns = arr_columns - else: - arrays, arr_columns, result_index = maybe_reorder( - arrays, arr_columns, columns, index - ) - - if exclude is None: - exclude = set() - else: - exclude = set(exclude) - - if index is not None: - if isinstance(index, str) or not hasattr(index, "__iter__"): - i = columns.get_loc(index) - exclude.add(index) - if len(arrays) > 0: - result_index = Index(arrays[i], name=index) - else: - result_index = Index([], name=index) - else: - try: - index_data = [arrays[arr_columns.get_loc(field)] for field in index] - except (KeyError, TypeError): - # raised by get_loc, see GH#29258 - result_index = index - else: - result_index = ensure_index_from_sequences(index_data, names=index) - exclude.update(index) - - if any(exclude): - arr_exclude = [x for x in exclude if x in arr_columns] - to_remove = [arr_columns.get_loc(col) for col in arr_exclude] - arrays = [v for i, v in enumerate(arrays) if i not in to_remove] - - columns = columns.drop(exclude) - - manager = get_option("mode.data_manager") - mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) - - return cls(mgr) - - def to_records( - self, index=True, column_dtypes=None, index_dtypes=None - ) -> np.recarray: - """ - Convert DataFrame to a NumPy record array. - - Index will be included as the first field of the record array if - requested. - - Parameters - ---------- - index : bool, default True - Include index in resulting record array, stored in 'index' - field or using the index label, if set. - column_dtypes : str, type, dict, default None - If a string or type, the data type to store all columns. If - a dictionary, a mapping of column names and indices (zero-indexed) - to specific data types. 
-        index_dtypes : str, type, dict, default None
-            If a string or type, the data type to store all index levels. If
-            a dictionary, a mapping of index level names and indices
-            (zero-indexed) to specific data types.
-
-            This mapping is applied only if `index=True`.
-
-        Returns
-        -------
-        numpy.recarray
-            NumPy ndarray with the DataFrame labels as fields and each row
-            of the DataFrame as entries.
-
-        See Also
-        --------
-        DataFrame.from_records: Convert structured or record ndarray
-            to DataFrame.
-        numpy.recarray: An ndarray that allows field access using
-            attributes, analogous to typed columns in a
-            spreadsheet.
-
-        Examples
-        --------
-        >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
-        ...                   index=['a', 'b'])
-        >>> df
-           A     B
-        a  1  0.50
-        b  2  0.75
-        >>> df.to_records()
-        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
-                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
-
-        If the DataFrame index has no label then the recarray field name
-        is set to 'index'. If the index has a label then this is used as the
-        field name:
-
-        >>> df.index = df.index.rename("I")
-        >>> df.to_records()
-        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
-                  dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
-
-        The index can be excluded from the record array:
-
-        >>> df.to_records(index=False)
-        rec.array([(1, 0.5 ), (2, 0.75)],
-                  dtype=[('A', '<i8'), ('B', '<f8')])
-
-        Data types can be specified for the columns:
-
-        >>> df.to_records(column_dtypes={"A": "int32"})
-        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
-                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
-
-        As well as for the index:
-
-        >>> df.to_records(index_dtypes="<S2")
-        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
-                  dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
-
-        >>> index_dtypes = f"<S{df.index.str.len().max()}"
-        >>> df.to_records(index_dtypes=index_dtypes)
-        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
-                  dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
-        """
-
-    @classmethod
-    def _from_arrays(
-        cls,
-        arrays,
-        columns,
-        index,
-        dtype: Dtype | None = None,
-        verify_integrity: bool = True,
-    ) -> DataFrame:
-        """
-        Create DataFrame from a list of arrays corresponding to the columns.
-
-        Parameters
-        ----------
-        arrays : list-like of arrays
-            Each array in the list corresponds to one column, in order.
-        columns : list-like, Index
-            The column names for the resulting DataFrame.
-        index : list-like, Index
-            The row labels for the resulting DataFrame.
-        dtype : dtype, optional
-            Optional dtype to enforce for all arrays.
-        verify_integrity : bool, default True
-            Validate and homogenize all input. If set to False, it is assumed
-            that all elements of `arrays` are actual arrays as they will be
-            stored in a block (numpy ndarray or ExtensionArray), have the same
-            length as and are aligned with the index, and that `columns` and
-            `index` are ensured to be an Index object.
-
-        Returns
-        -------
-        DataFrame
-        """
-        if dtype is not None:
-            dtype = pandas_dtype(dtype)
-
-        manager = get_option("mode.data_manager")
-        columns = ensure_index(columns)
-        if len(columns) != len(arrays):
-            raise ValueError("len(columns) must match len(arrays)")
-        mgr = arrays_to_mgr(
-            arrays,
-            columns,
-            index,
-            dtype=dtype,
-            verify_integrity=verify_integrity,
-            typ=manager,
-        )
-        return cls(mgr)
-
-    @doc(
-        storage_options=_shared_docs["storage_options"],
-        compression_options=_shared_docs["compression_options"] % "path",
-    )
-    @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
-    @validate_bool_kwargs_from_keywords('write_index')
-    def to_stata(
-        self,
-        path: FilePath | WriteBuffer[bytes],
-        convert_dates: dict[Hashable, str] | None = None,
-        write_index: bool = True,
-        byteorder: str | None = None,
-        time_stamp: datetime.datetime | None = None,
-        data_label: str | None = None,
-        variable_labels: dict[Hashable, str] | None = None,
-        version: int | None = 114,
-        convert_strl: Sequence[Hashable] | None = None,
-        compression: CompressionOptions = "infer",
-        storage_options: StorageOptions = None,
-        *,
-        value_labels: dict[Hashable, dict[float | int, str]] | None = None,
-    ) -> None:
-        """
-        Export DataFrame object to Stata dta format.
-
-        Writes the DataFrame to a Stata dataset file.
-        "dta" files contain a Stata dataset.
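Before moving on to `to_stata`: the `to_records` examples above pair naturally with `from_records`, since a record array round-trips. The index comes back under the field name used when writing (here the default 'index'); `equals` still reports a match because index names are not compared:

import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"])
rec = df.to_records()  # index stored as the 'index' field
back = pd.DataFrame.from_records(rec, index="index")
print(back.equals(df))  # True -- values, dtypes and labels all survive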
- - Parameters - ---------- - path : str, path object, or buffer - String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a binary ``write()`` function. - - .. versionchanged:: 1.0.0 - - Previously this was "fname" - - convert_dates : dict - Dictionary mapping columns containing datetime types to stata - internal format to use when writing the dates. Options are 'tc', - 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer - or a name. Datetime columns that do not have a conversion type - specified will be converted to 'tc'. Raises NotImplementedError if - a datetime column has timezone information. - write_index : bool - Write the index to Stata dataset. - byteorder : str - Can be ">", "<", "little", or "big". default is `sys.byteorder`. - time_stamp : datetime - A datetime to use as file creation date. Default is the current - time. - data_label : str, optional - A label for the data set. Must be 80 characters or smaller. - variable_labels : dict - Dictionary containing columns as keys and variable labels as - values. Each label must be 80 characters or smaller. - version : {{114, 117, 118, 119, None}}, default 114 - Version to use in the output dta file. Set to None to let pandas - decide between 118 or 119 formats depending on the number of - columns in the frame. Version 114 can be read by Stata 10 and - later. Version 117 can be read by Stata 13 or later. Version 118 - is supported in Stata 14 and later. Version 119 is supported in - Stata 15 and later. Version 114 limits string variables to 244 - characters or fewer while versions 117 and later allow strings - with lengths up to 2,000,000 characters. Versions 118 and 119 - support Unicode characters, and version 119 supports more than - 32,767 variables. - - Version 119 should usually only be used when the number of - variables exceeds the capacity of dta format 118. Exporting - smaller datasets in format 119 may have unintended consequences, - and, as of November 2020, Stata SE cannot read version 119 files. - - .. versionchanged:: 1.0.0 - - Added support for formats 118 and 119. - - convert_strl : list, optional - List of column names to convert to string columns to Stata StrL - format. Only available if version is 117. Storing strings in the - StrL format can produce smaller dta files if strings have more than - 8 characters and values are repeated. - {compression_options} - - .. versionadded:: 1.1.0 - - .. versionchanged:: 1.4.0 Zstandard support. - - {storage_options} - - .. versionadded:: 1.2.0 - - value_labels : dict of dicts - Dictionary containing columns as keys and dictionaries of column value - to labels as values. Labels for a single variable must be 32,000 - characters or smaller. - - .. versionadded:: 1.4.0 - - Raises - ------ - NotImplementedError - * If datetimes contain timezone information - * Column dtype is not representable in Stata - ValueError - * Columns listed in convert_dates are neither datetime64[ns] - or datetime.datetime - * Column listed in convert_dates is not in DataFrame - * Categorical label contains more than 32,000 characters - - See Also - -------- - read_stata : Import Stata data files. - io.stata.StataWriter : Low-level writer for Stata data files. - io.stata.StataWriter117 : Low-level writer for version 117 files. - - Examples - -------- - >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', - ... 'parrot'], - ... 
'speed': [350, 18, 361, 15]}}) - >>> df.to_stata('animals.dta') # doctest: +SKIP - """ - if version not in (114, 117, 118, 119, None): - raise ValueError("Only formats 114, 117, 118 and 119 are supported.") - if version == 114: - if convert_strl is not None: - raise ValueError("strl is not supported in format 114") - from pandas.io.stata import StataWriter as statawriter - elif version == 117: - # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import ( # type: ignore[no-redef] - StataWriter117 as statawriter, - ) - else: # versions 118 and 119 - # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import ( # type: ignore[no-redef] - StataWriterUTF8 as statawriter, - ) - - kwargs: dict[str, Any] = {} - if version is None or version >= 117: - # strl conversion is only supported >= 117 - kwargs["convert_strl"] = convert_strl - if version is None or version >= 118: - # Specifying the version is only supported for UTF8 (118 or 119) - kwargs["version"] = version - - writer = statawriter( - path, - self, - convert_dates=convert_dates, - byteorder=byteorder, - time_stamp=time_stamp, - data_label=data_label, - write_index=write_index, - variable_labels=variable_labels, - compression=compression, - storage_options=storage_options, - value_labels=value_labels, - **kwargs, - ) - writer.write_file() - - @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: - """ - Write a DataFrame to the binary Feather format. - - Parameters - ---------- - path : str, path object, file-like object - String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a binary ``write()`` function. If a string or a path, - it will be used as Root Directory path when writing a partitioned dataset. - **kwargs : - Additional keywords passed to :func:`pyarrow.feather.write_feather`. - Starting with pyarrow 0.17, this includes the `compression`, - `compression_level`, `chunksize` and `version` keywords. - - .. versionadded:: 1.1.0 - - Notes - ----- - This function writes the dataframe as a `feather file - `_. Requires a default - index. For saving the DataFrame with your custom index use a method that - supports custom indices e.g. `to_parquet`. - """ - from pandas.io.feather_format import to_feather - - to_feather(self, path, **kwargs) - - @doc( - Series.to_markdown, - klass=_shared_doc_kwargs["klass"], - storage_options=_shared_docs["storage_options"], - examples="""Examples - -------- - >>> df = pd.DataFrame( - ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} - ... ) - >>> print(df.to_markdown()) - | | animal_1 | animal_2 | - |---:|:-----------|:-----------| - | 0 | elk | dog | - | 1 | pig | quetzal | - - Output markdown with a tabulate option. - - >>> print(df.to_markdown(tablefmt="grid")) - +----+------------+------------+ - | | animal_1 | animal_2 | - +====+============+============+ - | 0 | elk | dog | - +----+------------+------------+ - | 1 | pig | quetzal | - +----+------------+------------+""", - ) - def to_markdown( - self, - buf: FilePath | WriteBuffer[str] | None = None, - mode: str = "wt", - index: bool = True, - storage_options: StorageOptions = None, - **kwargs, - ) -> str | None: - if "showindex" in kwargs: - warnings.warn( - "'showindex' is deprecated. Only 'index' will be used " - "in a future version. 
Use 'index' to silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - kwargs.setdefault("headers", "keys") - kwargs.setdefault("tablefmt", "pipe") - kwargs.setdefault("showindex", index) - tabulate = import_optional_dependency("tabulate") - result = tabulate.tabulate(self, **kwargs) - if buf is None: - return result - - with get_handle(buf, mode, storage_options=storage_options) as handles: - handles.handle.write(result) - return None - - @doc(storage_options=_shared_docs["storage_options"]) - @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_parquet( - self, - path: FilePath | WriteBuffer[bytes] | None = None, - engine: str = "auto", - compression: str | None = "snappy", - index: bool | None = None, - partition_cols: list[str] | None = None, - storage_options: StorageOptions = None, - **kwargs, - ) -> bytes | None: - """ - Write a DataFrame to the binary parquet format. - - This function writes the dataframe as a `parquet file - `_. You can choose different parquet - backends, and have the option of compression. See - :ref:`the user guide ` for more details. - - Parameters - ---------- - path : str, path object, file-like object, or None, default None - String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a binary ``write()`` function. If None, the result is - returned as bytes. If a string or path, it will be used as Root Directory - path when writing a partitioned dataset. - - .. versionchanged:: 1.2.0 - - Previously this was "fname" - - engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' - Parquet library to use. If 'auto', then the option - ``io.parquet.engine`` is used. The default ``io.parquet.engine`` - behavior is to try 'pyarrow', falling back to 'fastparquet' if - 'pyarrow' is unavailable. - compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy' - Name of the compression to use. Use ``None`` for no compression. - index : bool, default None - If ``True``, include the dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. - If ``None``, similar to ``True`` the dataframe's index(es) - will be saved. However, instead of being saved as values, - the RangeIndex will be stored as a range in the metadata so it - doesn't require much space and is faster. Other indexes will - be included as columns in the file output. - partition_cols : list, optional, default None - Column names by which to partition the dataset. - Columns are partitioned in the order they are given. - Must be None if path is not a string. - {storage_options} - - .. versionadded:: 1.2.0 - - **kwargs - Additional arguments passed to the parquet library. See - :ref:`pandas io ` for more details. - - Returns - ------- - bytes if no path argument is provided else None - - See Also - -------- - read_parquet : Read a parquet file. - DataFrame.to_csv : Write a csv file. - DataFrame.to_sql : Write to a sql table. - DataFrame.to_hdf : Write to hdf. - - Notes - ----- - This function requires either the `fastparquet - `_ or `pyarrow - `_ library. - - Examples - -------- - >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) - >>> df.to_parquet('df.parquet.gzip', - ... compression='gzip') # doctest: +SKIP - >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP - col1 col2 - 0 1 3 - 1 2 4 - - If you want to get a buffer to the parquet content you can use a io.BytesIO - object, as long as you don't use partition_cols, which creates multiple files. 
- - >>> import io - >>> f = io.BytesIO() - >>> df.to_parquet(f) - >>> f.seek(0) - 0 - >>> content = f.read() - """ - from pandas.io.parquet import to_parquet - - return to_parquet( - self, - path, - engine, - compression=compression, - index=index, - partition_cols=partition_cols, - storage_options=storage_options, - **kwargs, - ) - - @Substitution( - header_type="bool", - header="Whether to print column labels, default True", - col_space_type="str or int, list or dict of int or str", - col_space="The minimum width of each column in CSS length " - "units. An int is assumed to be px units.\n\n" - " .. versionadded:: 0.25.0\n" - " Ability to use str", - ) - @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) - @validate_bool_kwargs_from_keywords('index', 'index_names', 'bold_rows', 'escape', 'notebook', 'render_links') - def to_html( - self, - buf: FilePath | WriteBuffer[str] | None = None, - columns: Sequence[str] | None = None, - col_space: ColspaceArgType | None = None, - header: bool | Sequence[str] = True, - index: bool = True, - na_rep: str = "NaN", - formatters: FormattersType | None = None, - float_format: FloatFormatType | None = None, - sparsify: bool | None = None, - index_names: bool = True, - justify: str | None = None, - max_rows: int | None = None, - max_cols: int | None = None, - show_dimensions: bool | str = False, - decimal: str = ".", - bold_rows: bool = True, - classes: str | list | tuple | None = None, - escape: bool = True, - notebook: bool = False, - border: int | bool | None = None, - table_id: str | None = None, - render_links: bool = False, - encoding: str | None = None, - ): - """ - Render a DataFrame as an HTML table. - %(shared_params)s - bold_rows : bool, default True - Make the row labels bold in the output. - classes : str or list or tuple, default None - CSS class(es) to apply to the resulting html table. - escape : bool, default True - Convert the characters <, >, and & to HTML-safe sequences. - notebook : {True, False}, default False - Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - `
<table>` tag. Default ``pd.options.display.html.border``. - table_id : str, optional - A css id is included in the opening `<table>
` tag if specified. - render_links : bool, default False - Convert URLs to HTML links. - encoding : str, default "utf-8" - Set character encoding. - - .. versionadded:: 1.0 - %(returns)s - See Also - -------- - to_string : Convert DataFrame to a string. - """ - if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: - raise ValueError("Invalid value for justify parameter") - - formatter = fmt.DataFrameFormatter( - self, - columns=columns, - col_space=col_space, - na_rep=na_rep, - header=header, - index=index, - formatters=formatters, - float_format=float_format, - bold_rows=bold_rows, - sparsify=sparsify, - justify=justify, - index_names=index_names, - escape=escape, - decimal=decimal, - max_rows=max_rows, - max_cols=max_cols, - show_dimensions=show_dimensions, - ) - # TODO: a generic formatter wld b in DataFrameFormatter - return fmt.DataFrameRenderer(formatter).to_html( - buf=buf, - classes=classes, - notebook=notebook, - border=border, - encoding=encoding, - table_id=table_id, - render_links=render_links, - ) - - @doc( - storage_options=_shared_docs["storage_options"], - compression_options=_shared_docs["compression_options"] % "path_or_buffer", - ) - @validate_bool_kwargs_from_keywords('index') - def to_xml( - self, - path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, - index: bool = True, - root_name: str | None = "data", - row_name: str | None = "row", - na_rep: str | None = None, - attr_cols: list[str] | None = None, - elem_cols: list[str] | None = None, - namespaces: dict[str | None, str] | None = None, - prefix: str | None = None, - encoding: str = "utf-8", - xml_declaration: bool | None = True, - pretty_print: bool | None = True, - parser: str | None = "lxml", - stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, - compression: CompressionOptions = "infer", - storage_options: StorageOptions = None, - ) -> str | None: - """ - Render a DataFrame to an XML document. - - .. versionadded:: 1.3.0 - - Parameters - ---------- - path_or_buffer : str, path object, file-like object, or None, default None - String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a ``write()`` function. If None, the result is returned - as a string. - index : bool, default True - Whether to include index in XML document. - root_name : str, default 'data' - The name of root element in XML document. - row_name : str, default 'row' - The name of row element in XML document. - na_rep : str, optional - Missing data representation. - attr_cols : list-like, optional - List of columns to write as attributes in row element. - Hierarchical columns will be flattened with underscore - delimiting the different levels. - elem_cols : list-like, optional - List of columns to write as children in row element. By default, - all columns output as children of row element. Hierarchical - columns will be flattened with underscore delimiting the - different levels. - namespaces : dict, optional - All namespaces to be defined in root element. Keys of dict - should be prefix names and values of dict corresponding URIs. - Default namespaces should be given empty string key. For - example, :: - - namespaces = {{"": "https://example.com"}} - - prefix : str, optional - Namespace prefix to be used for every element and/or attribute - in document. This should be one of the keys in ``namespaces`` - dict. - encoding : str, default 'utf-8' - Encoding of the resulting document. 
-        xml_declaration : bool, default True
-            Whether to include the XML declaration at start of document.
-        pretty_print : bool, default True
-            Whether output should be pretty printed with indentation and
-            line breaks.
-        parser : {{'lxml','etree'}}, default 'lxml'
-            Parser module to use for building the tree. Only 'lxml' and
-            'etree' are supported. With 'lxml', the ability to use XSLT
-            stylesheet is supported.
-        stylesheet : str, path object or file-like object, optional
-            A URL, file-like object, or a raw string containing an XSLT
-            script used to transform the raw XML output. Script should use
-            layout of elements and attributes from original output. This
-            argument requires ``lxml`` to be installed. Only XSLT 1.0
-            scripts, and not later versions, are currently supported.
-        {compression_options}
-
-            .. versionchanged:: 1.4.0 Zstandard support.
-
-        {storage_options}
-
-        Returns
-        -------
-        None or str
-            If ``io`` is None, returns the resulting XML format as a
-            string. Otherwise returns None.
-
-        See Also
-        --------
-        to_json : Convert the pandas object to a JSON string.
-        to_html : Convert DataFrame to HTML.
-
-        Examples
-        --------
-        >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
-        ...                    'degrees': [360, 360, 180],
-        ...                    'sides': [4, np.nan, 3]}})
-
-        >>> df.to_xml()  # doctest: +SKIP
-        <?xml version='1.0' encoding='utf-8'?>
-        <data>
-          <row>
-            <index>0</index>
-            <shape>square</shape>
-            <degrees>360</degrees>
-            <sides>4.0</sides>
-          </row>
-          <row>
-            <index>1</index>
-            <shape>circle</shape>
-            <degrees>360</degrees>
-            <sides/>
-          </row>
-          <row>
-            <index>2</index>
-            <shape>triangle</shape>
-            <degrees>180</degrees>
-            <sides>3.0</sides>
-          </row>
-        </data>
-
-        >>> df.to_xml(attr_cols=[
-        ...           'index', 'shape', 'degrees', 'sides'
-        ...           ])  # doctest: +SKIP
-        <?xml version='1.0' encoding='utf-8'?>
-        <data>
-          <row index="0" shape="square" degrees="360" sides="4.0"/>
-          <row index="1" shape="circle" degrees="360"/>
-          <row index="2" shape="triangle" degrees="180" sides="3.0"/>
-        </data>
-
-        >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
-        ...           prefix="doc")  # doctest: +SKIP
-        <?xml version='1.0' encoding='utf-8'?>
-        <doc:data xmlns:doc="https://example.com">
-          <doc:row>
-            <doc:index>0</doc:index>
-            <doc:shape>square</doc:shape>
-            <doc:degrees>360</doc:degrees>
-            <doc:sides>4.0</doc:sides>
-          </doc:row>
-          <doc:row>
-            <doc:index>1</doc:index>
-            <doc:shape>circle</doc:shape>
-            <doc:degrees>360</doc:degrees>
-            <doc:sides/>
-          </doc:row>
-          <doc:row>
-            <doc:index>2</doc:index>
-            <doc:shape>triangle</doc:shape>
-            <doc:degrees>180</doc:degrees>
-            <doc:sides>3.0</doc:sides>
-          </doc:row>
-        </doc:data>
-        """
-
-        from pandas.io.formats.xml import (
-            EtreeXMLFormatter,
-            LxmlXMLFormatter,
-        )
-
-        lxml = import_optional_dependency("lxml.etree", errors="ignore")
-
-        TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter]
-
-        if parser == "lxml":
-            if lxml is not None:
-                TreeBuilder = LxmlXMLFormatter
-            else:
-                raise ImportError(
-                    "lxml not found, please install or use the etree parser."
-                )
-
-        elif parser == "etree":
-            TreeBuilder = EtreeXMLFormatter
-
-        else:
-            raise ValueError("Values for parser can only be lxml or etree.")
-
-        xml_formatter = TreeBuilder(
-            self,
-            path_or_buffer=path_or_buffer,
-            index=index,
-            root_name=root_name,
-            row_name=row_name,
-            na_rep=na_rep,
-            attr_cols=attr_cols,
-            elem_cols=elem_cols,
-            namespaces=namespaces,
-            prefix=prefix,
-            encoding=encoding,
-            xml_declaration=xml_declaration,
-            pretty_print=pretty_print,
-            stylesheet=stylesheet,
-            compression=compression,
-            storage_options=storage_options,
-        )
-
-        return xml_formatter.write_output()
-
-    # ----------------------------------------------------------------------
-    @doc(INFO_DOCSTRING, **frame_sub_kwargs)
-    def info(
-        self,
-        verbose: bool | None = None,
-        buf: WriteBuffer[str] | None = None,
-        max_cols: int | None = None,
-        memory_usage: bool | str | None = None,
-        show_counts: bool | None = None,
-        null_counts: bool | None = None,
-    ) -> None:
-        if null_counts is not None:
-            if show_counts is not None:
-                raise ValueError("null_counts used with show_counts. Use show_counts.")
-            warnings.warn(
-                "null_counts is deprecated. "
-                "Use show_counts instead",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-            show_counts = null_counts
-        info = DataFrameInfo(
-            data=self,
-            memory_usage=memory_usage,
-        )
-        info.render(
-            buf=buf,
-            max_cols=max_cols,
-            verbose=verbose,
-            show_counts=show_counts,
-        )
-
-    @validate_bool_kwargs_from_keywords('index', 'deep')
-    def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
-        """
-        Return the memory usage of each column in bytes.
-
-        The memory usage can optionally include the contribution of
-        the index and elements of `object` dtype.
-
-        This value is displayed in `DataFrame.info` by default. This can be
-        suppressed by setting ``pandas.options.display.memory_usage`` to False.
-
-        Parameters
-        ----------
-        index : bool, default True
-            Specifies whether to include the memory usage of the DataFrame's
-            index in returned Series. If ``index=True``, the memory usage of
-            the index is the first item in the output.
-        deep : bool, default False
-            If True, introspect the data deeply by interrogating
-            `object` dtypes for system-level memory consumption, and include
-            it in the returned values.
-
-        Returns
-        -------
-        Series
-            A Series whose index is the original column names and whose values
-            are the memory usage of each column in bytes.
-
-        See Also
-        --------
-        numpy.ndarray.nbytes : Total bytes consumed by the elements of an
-            ndarray.
-        Series.memory_usage : Bytes consumed by a Series.
-        Categorical : Memory-efficient array for string values with
-            many repeated values.
-        DataFrame.info : Concise summary of a DataFrame.
-
-        Notes
-        -----
-        See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
-        details.
-
-        Examples
-        --------
-        >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
-        >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
-        ...              for t in dtypes])
-        >>> df = pd.DataFrame(data)
-        >>> df.head()
-           int64  float64  complex128  object  bool
-        0      1      1.0    1.0+0.0j       1  True
-        1      1      1.0    1.0+0.0j       1  True
-        2      1      1.0    1.0+0.0j       1  True
-        3      1      1.0    1.0+0.0j       1  True
-        4      1      1.0    1.0+0.0j       1  True
-
-        >>> df.memory_usage()
-        Index           128
-        int64         40000
-        float64       40000
-        complex128    80000
-        object        40000
-        bool           5000
-        dtype: int64
-
-        >>> df.memory_usage(index=False)
-        int64         40000
-        float64       40000
-        complex128    80000
-        object        40000
-        bool           5000
-        dtype: int64
-
-        The memory footprint of `object` dtype columns is ignored by default:
-
-        >>> df.memory_usage(deep=True)
-        Index            128
-        int64          40000
-        float64        40000
-        complex128     80000
-        object        180000
-        bool            5000
-        dtype: int64
-
-        Use a Categorical for efficient storage of an object-dtype column with
-        many repeated values.
-
-        >>> df['object'].astype('category').memory_usage(deep=True)
-        5244
-        """
-        result = self._constructor_sliced(
-            [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
-            index=self.columns,
-        )
-        if index:
-            index_memory_usage = self._constructor_sliced(
-                self.index.memory_usage(deep=deep), index=["Index"]
-            )
-            result = index_memory_usage._append(result)
-        return result
-
-    @validate_bool_kwargs_from_keywords('copy')
-    def transpose(self, *args, copy: bool = False) -> DataFrame:
-        """
-        Transpose index and columns.
-
-        Reflect the DataFrame over its main diagonal by writing rows as columns
-        and vice-versa. The property :attr:`.T` is an accessor to the method
-        :meth:`transpose`.
-
-        Parameters
-        ----------
-        *args : tuple, optional
-            Accepted for compatibility with NumPy.
-        copy : bool, default False
-            Whether to copy the data after transposing, even for DataFrames
-            with a single dtype.
- - Note that a copy is always required for mixed dtype DataFrames, - or for DataFrames with any extension types. - - Returns - ------- - DataFrame - The transposed DataFrame. - - See Also - -------- - numpy.transpose : Permute the dimensions of a given array. - - Notes - ----- - Transposing a DataFrame with mixed dtypes will result in a homogeneous - DataFrame with the `object` dtype. In such a case, a copy of the data - is always made. - - Examples - -------- - **Square DataFrame with homogeneous dtype** - - >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} - >>> df1 = pd.DataFrame(data=d1) - >>> df1 - col1 col2 - 0 1 3 - 1 2 4 - - >>> df1_transposed = df1.T # or df1.transpose() - >>> df1_transposed - 0 1 - col1 1 2 - col2 3 4 - - When the dtype is homogeneous in the original DataFrame, we get a - transposed DataFrame with the same dtype: - - >>> df1.dtypes - col1 int64 - col2 int64 - dtype: object - >>> df1_transposed.dtypes - 0 int64 - 1 int64 - dtype: object - - **Non-square DataFrame with mixed dtypes** - - >>> d2 = {'name': ['Alice', 'Bob'], - ... 'score': [9.5, 8], - ... 'employed': [False, True], - ... 'kids': [0, 0]} - >>> df2 = pd.DataFrame(data=d2) - >>> df2 - name score employed kids - 0 Alice 9.5 False 0 - 1 Bob 8.0 True 0 - - >>> df2_transposed = df2.T # or df2.transpose() - >>> df2_transposed - 0 1 - name Alice Bob - score 9.5 8.0 - employed False True - kids 0 0 - - When the DataFrame has mixed dtypes, we get a transposed DataFrame with - the `object` dtype: - - >>> df2.dtypes - name object - score float64 - employed bool - kids int64 - dtype: object - >>> df2_transposed.dtypes - 0 object - 1 object - dtype: object - """ - nv.validate_transpose(args, {}) - # construct the args - - dtypes = list(self.dtypes) - - if self._can_fast_transpose: - # Note: tests pass without this, but this improves perf quite a bit. - new_vals = self._values.T - if copy: - new_vals = new_vals.copy() - - result = self._constructor(new_vals, index=self.columns, columns=self.index) - - elif ( - self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]) - ): - # We have EAs with the same dtype. We can preserve that dtype in transpose. - dtype = dtypes[0] - arr_type = dtype.construct_array_type() - values = self.values - - new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] - result = type(self)._from_arrays( - new_values, index=self.columns, columns=self.index - ) - - else: - new_arr = self.values.T - if copy: - new_arr = new_arr.copy() - result = self._constructor(new_arr, index=self.columns, columns=self.index) - - return result.__finalize__(self, method="transpose") - - @property - def T(self) -> DataFrame: - return self.transpose() - - # ---------------------------------------------------------------------- - # Indexing Methods - - def _ixs(self, i: int, axis: int = 0): - """ - Parameters - ---------- - i : int - axis : int - - Notes - ----- - If slice passed, the resulting data will be a view. 
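One consequence of the `transpose` notes above is worth seeing concretely: once mixed dtypes are homogenized to `object`, transposing back does not restore them.

import pandas as pd

df = pd.DataFrame({"name": ["Alice", "Bob"], "kids": [0, 0]})
print(df.dtypes.tolist())      # [dtype('O'), dtype('int64')]
print(df.T.dtypes.tolist())    # both object -- mixed dtypes homogenize
print(df.T.T.dtypes.tolist())  # still object -- the round trip is lossy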
- """ - # irow - if axis == 0: - new_values = self._mgr.fast_xs(i) - - # if we are a copy, mark as such - copy = isinstance(new_values, np.ndarray) and new_values.base is None - result = self._constructor_sliced( - new_values, - index=self.columns, - name=self.index[i], - dtype=new_values.dtype, - ).__finalize__(self) - result._set_is_copy(self, copy=copy) - return result - - # icol - else: - label = self.columns[i] - - col_mgr = self._mgr.iget(i) - result = self._box_col_values(col_mgr, i) - - # this is a cached value, mark it so - result._set_as_cached(label, self) - return result - - def _get_column_array(self, i: int) -> ArrayLike: - """ - Get the values of the i'th column (ndarray or ExtensionArray, as stored - in the Block) - """ - return self._mgr.iget_values(i) - - def _iter_column_arrays(self) -> Iterator[ArrayLike]: - """ - Iterate over the arrays of all columns in order. - This returns the values as stored in the Block (ndarray or ExtensionArray). - """ - for i in range(len(self.columns)): - yield self._get_column_array(i) - - def __getitem__(self, key): - check_deprecated_indexers(key) - key = lib.item_from_zerodim(key) - key = com.apply_if_callable(key, self) - - if is_hashable(key) and not is_iterator(key): - # is_iterator to exclude generator e.g. test_getitem_listlike - # shortcut if the key is in columns - if self.columns.is_unique and key in self.columns: - if isinstance(self.columns, MultiIndex): - return self._getitem_multilevel(key) - return self._get_item_cache(key) - - # Do we have a slicer (on rows)? - indexer = convert_to_index_sliceable(self, key) - if indexer is not None: - if isinstance(indexer, np.ndarray): - indexer = lib.maybe_indices_to_slice( - indexer.astype(np.intp, copy=False), len(self) - ) - if isinstance(indexer, np.ndarray): - # GH#43223 If we can not convert, use take - return self.take(indexer, axis=0) - # either we have a slice or we have a string that can be converted - # to a slice for partial-string date indexing - return self._slice(indexer, axis=0) - - # Do we have a (boolean) DataFrame? - if isinstance(key, DataFrame): - return self.where(key) - - # Do we have a (boolean) 1d indexer? - if com.is_bool_indexer(key): - return self._getitem_bool_array(key) - - # We are left with two options: a single key, and a collection of keys, - # We interpret tuples as collections only for non-MultiIndex - is_single_key = isinstance(key, tuple) or not is_list_like(key) - - if is_single_key: - if self.columns.nlevels > 1: - return self._getitem_multilevel(key) - indexer = self.columns.get_loc(key) - if is_integer(indexer): - indexer = [indexer] - else: - if is_iterator(key): - key = list(key) - indexer = self.columns._get_indexer_strict(key, "columns")[1] - - # take() does not accept boolean indexers - if getattr(indexer, "dtype", None) == bool: - indexer = np.where(indexer)[0] - - data = self._take_with_is_copy(indexer, axis=1) - - if is_single_key: - # What does looking for a single key in a non-unique index return? - # The behavior is inconsistent. 
It returns a Series, except when - # - the key itself is repeated (test on data.shape, #9519), or - # - we have a MultiIndex on columns (test on self.columns, #21309) - if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): - # GH#26490 using data[key] can cause RecursionError - return data._get_item_cache(key) - - return data - - def _getitem_bool_array(self, key): - # also raises Exception if object array with NA values - # warning here just in case -- previously __setitem__ was - # reindexing but __getitem__ was not; it seems more reasonable to - # go with the __setitem__ behavior since that is more consistent - # with all other indexing behavior - if isinstance(key, Series) and not key.index.equals(self.index): - warnings.warn( - "Boolean Series key will be reindexed to match DataFrame index.", - UserWarning, - stacklevel=find_stack_level(), - ) - elif len(key) != len(self.index): - raise ValueError( - f"Item wrong length {len(key)} instead of {len(self.index)}." - ) - - # check_bool_indexer will throw exception if Series key cannot - # be reindexed to match DataFrame rows - key = check_bool_indexer(self.index, key) - indexer = key.nonzero()[0] - return self._take_with_is_copy(indexer, axis=0) - - def _getitem_multilevel(self, key): - # self.columns is a MultiIndex - loc = self.columns.get_loc(key) - if isinstance(loc, (slice, np.ndarray)): - new_columns = self.columns[loc] - result_columns = maybe_droplevels(new_columns, key) - if self._is_mixed_type: - result = self.reindex(columns=new_columns) - result.columns = result_columns - else: - new_values = self.values[:, loc] - result = self._constructor( - new_values, index=self.index, columns=result_columns - ) - result = result.__finalize__(self) - - # If there is only one column being returned, and its name is - # either an empty string, or a tuple with an empty string as its - # first element, then treat the empty string as a placeholder - # and return the column as if the user had provided that empty - # string in the key. If the result is a Series, exclude the - # implied empty string from its name. - if len(result.columns) == 1: - top = result.columns[0] - if isinstance(top, tuple): - top = top[0] - if top == "": - result = result[""] - if isinstance(result, Series): - result = self._constructor_sliced( - result, index=self.index, name=key - ) - - result._set_is_copy(self) - return result - else: - # loc is neither a slice nor ndarray, so must be an int - return self._ixs(loc, axis=1) - - @validate_bool_kwargs_from_keywords('takeable') - def _get_value(self, index, col, takeable: bool = False) -> Scalar: - """ - Quickly retrieve single value at passed column and index. - - Parameters - ---------- - index : row label - col : column label - takeable : interpret the index/col as indexers, default False - - Returns - ------- - scalar - - Notes - ----- - Assumes that both `self.index._index_as_unique` and - `self.columns._index_as_unique`; Caller is responsible for checking. 
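The reindexing warning in `_getitem_bool_array` above is subtle: a boolean Series key is aligned by label, not position, before filtering. For example:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
mask = pd.Series([True, False, False], index=[2, 1, 0])  # labels reversed
# UserWarning: Boolean Series key will be reindexed to match DataFrame index.
print(df[mask])  # keeps the row labeled 2 (value 3), not positional row 0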
- """ - if takeable: - series = self._ixs(col, axis=1) - return series._values[index] - - series = self._get_item_cache(col) - engine = self.index._engine - - if not isinstance(self.index, MultiIndex): - # CategoricalIndex: Trying to use the engine fastpath may give incorrect - # results if our categories are integers that dont match our codes - # IntervalIndex: IntervalTree has no get_loc - row = self.index.get_loc(index) - return series._values[row] - - # For MultiIndex going through engine effectively restricts us to - # same-length tuples; see test_get_set_value_no_partial_indexing - loc = engine.get_loc(index) - return series._values[loc] - - def __setitem__(self, key, value): - key = com.apply_if_callable(key, self) - - # see if we can slice the rows - indexer = convert_to_index_sliceable(self, key) - if indexer is not None: - # either we have a slice or we have a string that can be converted - # to a slice for partial-string date indexing - return self._setitem_slice(indexer, value) - - if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: - self._setitem_frame(key, value) - elif isinstance(key, (Series, np.ndarray, list, Index)): - self._setitem_array(key, value) - elif isinstance(value, DataFrame): - self._set_item_frame_value(key, value) - elif ( - is_list_like(value) - and not self.columns.is_unique - and 1 < len(self.columns.get_indexer_for([key])) == len(value) - ): - # Column to set is duplicated - self._setitem_array([key], value) - else: - # set column - self._set_item(key, value) - - def _setitem_slice(self, key: slice, value): - # NB: we can't just use self.loc[key] = value because that - # operates on labels and we need to operate positional for - # backwards-compat, xref GH#31469 - self._check_setitem_copy() - self.iloc[key] = value - - def _setitem_array(self, key, value): - # also raises Exception if object array with NA values - if com.is_bool_indexer(key): - # bool indexer is indexing along rows - if len(key) != len(self.index): - raise ValueError( - f"Item wrong length {len(key)} instead of {len(self.index)}!" - ) - key = check_bool_indexer(self.index, key) - indexer = key.nonzero()[0] - self._check_setitem_copy() - if isinstance(value, DataFrame): - # GH#39931 reindex since iloc does not align - value = value.reindex(self.index.take(indexer)) - self.iloc[indexer] = value - - else: - # Note: unlike self.iloc[:, indexer] = value, this will - # never try to overwrite values inplace - - if isinstance(value, DataFrame): - check_key_length(self.columns, key, value) - for k1, k2 in zip(key, value.columns): - self[k1] = value[k2] - - elif not is_list_like(value): - for col in key: - self[col] = value - - elif isinstance(value, np.ndarray) and value.ndim == 2: - self._iset_not_inplace(key, value) - - elif np.ndim(value) > 1: - # list of lists - value = DataFrame(value).values - return self._setitem_array(key, value) - - else: - self._iset_not_inplace(key, value) - - def _iset_not_inplace(self, key, value): - # GH#39510 when setting with df[key] = obj with a list-like key and - # list-like value, we iterate over those listlikes and set columns - # one at a time. This is different from dispatching to - # `self.loc[:, key]= value` because loc.__setitem__ may overwrite - # data inplace, whereas this will insert new arrays. 
- - def igetitem(obj, i: int): - # Note: we catch DataFrame obj before getting here, but - # hypothetically would return obj.iloc[:, i] - if isinstance(obj, np.ndarray): - return obj[..., i] - else: - return obj[i] - - if self.columns.is_unique: - if np.shape(value)[-1] != len(key): - raise ValueError("Columns must be same length as key") - - for i, col in enumerate(key): - self[col] = igetitem(value, i) - - else: - - ilocs = self.columns.get_indexer_non_unique(key)[0] - if (ilocs < 0).any(): - # key entries not in self.columns - raise NotImplementedError - - if np.shape(value)[-1] != len(ilocs): - raise ValueError("Columns must be same length as key") - - assert np.ndim(value) <= 2 - - orig_columns = self.columns - - # Using self.iloc[:, i] = ... may set values inplace, which - # by convention we do not do in __setitem__ - try: - self.columns = Index(range(len(self.columns))) - for i, iloc in enumerate(ilocs): - self[iloc] = igetitem(value, i) - finally: - self.columns = orig_columns - - def _setitem_frame(self, key, value): - # support boolean setting with DataFrame input, e.g. - # df[df > df2] = 0 - if isinstance(key, np.ndarray): - if key.shape != self.shape: - raise ValueError("Array conditional must be same shape as self") - key = self._constructor(key, **self._construct_axes_dict()) - - if key.size and not is_bool_dtype(key.values): - raise TypeError( - "Must pass DataFrame or 2-d ndarray with boolean values only" - ) - - self._check_inplace_setting(value) - self._check_setitem_copy() - self._where(-key, value, inplace=True) - - def _set_item_frame_value(self, key, value: DataFrame) -> None: - self._ensure_valid_index(value) - - # align columns - if key in self.columns: - loc = self.columns.get_loc(key) - cols = self.columns[loc] - len_cols = 1 if is_scalar(cols) else len(cols) - if len_cols != len(value.columns): - raise ValueError("Columns must be same length as key") - - # align right-hand-side columns if self.columns - # is multi-index and self[key] is a sub-frame - if isinstance(self.columns, MultiIndex) and isinstance( - loc, (slice, Series, np.ndarray, Index) - ): - cols = maybe_droplevels(cols, key) - if len(cols) and not cols.equals(value.columns): - value = value.reindex(cols, axis=1) - - # now align rows - arraylike = _reindex_for_setitem(value, self.index) - self._set_item_mgr(key, arraylike) - - def _iset_item_mgr( - self, loc: int | slice | np.ndarray, value, inplace: bool = False - ) -> None: - # when called from _set_item_mgr loc can be anything returned from get_loc - self._mgr.iset(loc, value, inplace=inplace) - self._clear_item_cache() - - def _set_item_mgr(self, key, value: ArrayLike) -> None: - try: - loc = self._info_axis.get_loc(key) - except KeyError: - # This item wasn't present, just insert at end - self._mgr.insert(len(self._info_axis), key, value) - else: - self._iset_item_mgr(loc, value) - - # check if we are modifying a copy - # try to set first as we want an invalid - # value exception to occur first - if len(self): - self._check_setitem_copy() - - def _iset_item(self, loc: int, value) -> None: - arraylike = self._sanitize_column(value) - self._iset_item_mgr(loc, arraylike, inplace=True) - - # check if we are modifying a copy - # try to set first as we want an invalid - # value exception to occur first - if len(self): - self._check_setitem_copy() - - def _set_item(self, key, value) -> None: - """ - Add series to DataFrame in specified column. 
-
-        If series is a numpy array (not a Series/TimeSeries), it must be the
-        same length as the DataFrame's index or an error will be raised.
-
-        Series/TimeSeries will be conformed to the DataFrame's index to
-        ensure homogeneity.
-        """
-        value = self._sanitize_column(value)
-
-        if (
-            key in self.columns
-            and value.ndim == 1
-            and not is_extension_array_dtype(value)
-        ):
-            # broadcast across multiple columns if necessary
-            if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
-                existing_piece = self[key]
-                if isinstance(existing_piece, DataFrame):
-                    value = np.tile(value, (len(existing_piece.columns), 1)).T
-
-        self._set_item_mgr(key, value)
-
-    def _set_value(
-        self, index: IndexLabel, col, value: Scalar, takeable: bool = False
-    ) -> None:
-        """
-        Put single value at passed column and index.
-
-        Parameters
-        ----------
-        index : Label
-            row label
-        col : Label
-            column label
-        value : scalar
-        takeable : bool, default False
-            Whether to interpret index/col as positional indexers.
-        """
-        try:
-            if takeable:
-                series = self._ixs(col, axis=1)
-                loc = index
-            else:
-                series = self._get_item_cache(col)
-                loc = self.index.get_loc(index)
-
-            # setitem_inplace will do validation that may raise TypeError,
-            # ValueError, or LossySetitemError
-            series._mgr.setitem_inplace(loc, value)
-
-        except (KeyError, TypeError, ValueError, LossySetitemError):
-            # set using a non-recursive method & reset the cache
-            if takeable:
-                self.iloc[index, col] = value
-            else:
-                self.loc[index, col] = value
-            self._item_cache.pop(col, None)
-
-    def _ensure_valid_index(self, value) -> None:
-        """
-        Ensure that if we don't have an index, that we can create one from the
-        passed value.
-        """
-        # GH5632, make sure that we are a Series convertible
-        if not len(self.index) and is_list_like(value) and len(value):
-            if not isinstance(value, DataFrame):
-                try:
-                    value = Series(value)
-                except (ValueError, NotImplementedError, TypeError) as err:
-                    raise ValueError(
-                        "Cannot set a frame with no defined index "
-                        "and a value that cannot be converted to a Series"
-                    ) from err
-
-            # GH31368 preserve name of index
-            index_copy = value.index.copy()
-            if self.index.name is not None:
-                index_copy.name = self.index.name
-
-            self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
-
-    def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
-        """
-        Provide boxed values for a column.
-        """
-        # Lookup in columns so that if e.g. a str datetime was passed
-        # we attach the Timestamp object as the name.
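        # (Illustrative: with a DatetimeIndex on columns, self.columns[loc]
        # is a Timestamp, so the boxed Series ends up named by that Timestamp
        # rather than by the raw string the caller used for lookup.)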
- name = self.columns[loc] - klass = self._constructor_sliced - # We get index=self.index bc values is a SingleDataManager - return klass(values, name=name, fastpath=True).__finalize__(self) - - # ---------------------------------------------------------------------- - # Lookup Caching - - def _clear_item_cache(self) -> None: - self._item_cache.clear() - - def _get_item_cache(self, item: Hashable) -> Series: - """Return the cached item, item represents a label indexer.""" - cache = self._item_cache - res = cache.get(item) - if res is None: - # All places that call _get_item_cache have unique columns, - # pending resolution of GH#33047 - - loc = self.columns.get_loc(item) - res = self._ixs(loc, axis=1) - - cache[item] = res - - # for a chain - res._is_copy = self._is_copy - return res - - def _reset_cacher(self) -> None: - # no-op for DataFrame - pass - - @validate_bool_kwargs_from_keywords('inplace') - def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: - """ - The object has called back to us saying maybe it has changed. - """ - loc = self._info_axis.get_loc(item) - arraylike = value._values - - old = self._ixs(loc, axis=1) - if old._values is value._values and inplace: - # GH#46149 avoid making unnecessary copies/block-splitting - return - - self._mgr.iset(loc, arraylike, inplace=inplace) - - # ---------------------------------------------------------------------- - # Unsorted - - @validate_bool_kwargs_from_keywords('inplace') - def query(self, expr: str, inplace: bool = False, **kwargs): - """ - Query the columns of a DataFrame with a boolean expression. - - Parameters - ---------- - expr : str - The query string to evaluate. - - You can refer to variables - in the environment by prefixing them with an '@' character like - ``@a + b``. - - You can refer to column names that are not valid Python variable names - by surrounding them in backticks. Thus, column names containing spaces - or punctuations (besides underscores) or starting with digits must be - surrounded by backticks. (For example, a column named "Area (cm^2)" would - be referenced as ```Area (cm^2)```). Column names which are Python keywords - (like "list", "for", "import", etc) cannot be used. - - For example, if one of your columns is called ``a a`` and you want - to sum it with ``b``, your query should be ```a a` + b``. - - .. versionadded:: 0.25.0 - Backtick quoting introduced. - - .. versionadded:: 1.0.0 - Expanding functionality of backtick quoting for more than only spaces. - - inplace : bool - Whether the query should modify the data in place or return - a modified copy. - **kwargs - See the documentation for :func:`eval` for complete details - on the keyword arguments accepted by :meth:`DataFrame.query`. - - Returns - ------- - DataFrame or None - DataFrame resulting from the provided query expression or - None if ``inplace=True``. - - See Also - -------- - eval : Evaluate a string describing operations on - DataFrame columns. - DataFrame.eval : Evaluate a string describing operations on - DataFrame columns. - - Notes - ----- - The result of the evaluation of this expression is first passed to - :attr:`DataFrame.loc` and if that fails because of a - multidimensional key (e.g., a DataFrame) then the result will be passed - to :meth:`DataFrame.__getitem__`. - - This method uses the top-level :func:`eval` function to - evaluate the passed query. - - The :meth:`~pandas.DataFrame.query` method uses a slightly - modified Python syntax by default. 
For example, the ``&`` and ``|`` - (bitwise) operators have the precedence of their boolean cousins, - :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, - however the semantics are different. - - You can change the semantics of the expression by passing the keyword - argument ``parser='python'``. This enforces the same semantics as - evaluation in Python space. Likewise, you can pass ``engine='python'`` - to evaluate an expression using Python itself as a backend. This is not - recommended as it is inefficient compared to using ``numexpr`` as the - engine. - - The :attr:`DataFrame.index` and - :attr:`DataFrame.columns` attributes of the - :class:`~pandas.DataFrame` instance are placed in the query namespace - by default, which allows you to treat both the index and columns of the - frame as a column in the frame. - The identifier ``index`` is used for the frame index; you can also - use the name of the index to identify it in a query. Please note that - Python keywords may not be used as identifiers. - - For further details and examples see the ``query`` documentation in - :ref:`indexing `. - - *Backtick quoted variables* - - Backtick quoted variables are parsed as literal Python code and - are converted internally to a Python valid identifier. - This can lead to the following problems. - - During parsing a number of disallowed characters inside the backtick - quoted string are replaced by strings that are allowed as a Python identifier. - These characters include all operators in Python, the space character, the - question mark, the exclamation mark, the dollar sign, and the euro sign. - For other characters that fall outside the ASCII range (U+0001..U+007F) - and those that are not further specified in PEP 3131, - the query parser will raise an error. - This excludes whitespace different than the space character, - but also the hashtag (as it is used for comments) and the backtick - itself (backtick can also not be escaped). - - In a special case, quotes that make a pair around a backtick can - confuse the parser. - For example, ```it's` > `that's``` will raise an error, - as it forms a quoted string (``'s > `that'``) with a backtick inside. - - See also the Python documentation about lexical analysis - (https://docs.python.org/3/reference/lexical_analysis.html) - in combination with the source code in :mod:`pandas.core.computation.parsing`. - - Examples - -------- - >>> df = pd.DataFrame({'A': range(1, 6), - ... 'B': range(10, 0, -2), - ... 'C C': range(10, 5, -1)}) - >>> df - A B C C - 0 1 10 10 - 1 2 8 9 - 2 3 6 8 - 3 4 4 7 - 4 5 2 6 - >>> df.query('A > B') - A B C C - 4 5 2 6 - - The previous expression is equivalent to - - >>> df[df.A > df.B] - A B C C - 4 5 2 6 - - For columns with spaces in their name, you can use backtick quoting. 
- - >>> df.query('B == `C C`') - A B C C - 0 1 10 10 - - The previous expression is equivalent to - - >>> df[df.B == df['C C']] - A B C C - 0 1 10 10 - """ - inplace = validate_bool_kwarg(inplace, "inplace") - if not isinstance(expr, str): - msg = f"expr must be a string to be evaluated, {type(expr)} given" - raise ValueError(msg) - kwargs["level"] = kwargs.pop("level", 0) + 1 - kwargs["target"] = None - res = self.eval(expr, **kwargs) - - try: - result = self.loc[res] - except ValueError: - # when res is multi-dimensional loc raises, but this is sometimes a - # valid query - result = self[res] - - if inplace: - self._update_inplace(result) - return None - else: - return result - - @validate_bool_kwargs_from_keywords('inplace') - def eval(self, expr: str, inplace: bool = False, **kwargs): - """ - Evaluate a string describing operations on DataFrame columns. - - Operates on columns only, not specific rows or elements. This allows - `eval` to run arbitrary code, which can make you vulnerable to code - injection if you pass user input to this function. - - Parameters - ---------- - expr : str - The expression string to evaluate. - inplace : bool, default False - If the expression contains an assignment, whether to perform the - operation inplace and mutate the existing DataFrame. Otherwise, - a new DataFrame is returned. - **kwargs - See the documentation for :func:`eval` for complete details - on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. - - Returns - ------- - ndarray, scalar, pandas object, or None - The result of the evaluation or None if ``inplace=True``. - - See Also - -------- - DataFrame.query : Evaluates a boolean expression to query the columns - of a frame. - DataFrame.assign : Can evaluate an expression or function to create new - values for a column. - eval : Evaluate a Python expression as a string using various - backends. - - Notes - ----- - For more details see the API documentation for :func:`~eval`. - For detailed examples see :ref:`enhancing performance with eval - `. - - Examples - -------- - >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) - >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 - >>> df.eval('A + B') - 0 11 - 1 10 - 2 9 - 3 8 - 4 7 - dtype: int64 - - Assignment is allowed though by default the original DataFrame is not - modified. - - >>> df.eval('C = A + B') - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 - >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 - - Use ``inplace=True`` to modify the original DataFrame. - - >>> df.eval('C = A + B', inplace=True) - >>> df - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 - - Multiple columns can be assigned to using multi-line expressions: - - >>> df.eval( - ... ''' - ... C = A + B - ... D = A - B - ... ''' - ... ) - A B C D - 0 1 10 11 -9 - 1 2 8 10 -6 - 2 3 6 9 -3 - 3 4 4 8 0 - 4 5 2 7 3 - """ - from pandas.core.computation.eval import eval as _eval - - inplace = validate_bool_kwarg(inplace, "inplace") - kwargs["level"] = kwargs.pop("level", 0) + 1 - index_resolvers = self._get_index_resolvers() - column_resolvers = self._get_cleaned_column_resolvers() - resolvers = column_resolvers, index_resolvers - if "target" not in kwargs: - kwargs["target"] = self - kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers - - return _eval(expr, inplace=inplace, **kwargs) - - def select_dtypes(self, include=None, exclude=None) -> DataFrame: - """ - Return a subset of the DataFrame's columns based on the column dtypes. 
- - Parameters - ---------- - include, exclude : scalar or list-like - A selection of dtypes or strings to be included/excluded. At least - one of these parameters must be supplied. - - Returns - ------- - DataFrame - The subset of the frame including the dtypes in ``include`` and - excluding the dtypes in ``exclude``. - - Raises - ------ - ValueError - * If both of ``include`` and ``exclude`` are empty - * If ``include`` and ``exclude`` have overlapping elements - * If any kind of string dtype is passed in. - - See Also - -------- - DataFrame.dtypes: Return Series with the data type of each column. - - Notes - ----- - * To select all *numeric* types, use ``np.number`` or ``'number'`` - * To select strings you must use the ``object`` dtype, but note that - this will return *all* object dtype columns - * See the `numpy dtype hierarchy - `__ - * To select datetimes, use ``np.datetime64``, ``'datetime'`` or - ``'datetime64'`` - * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or - ``'timedelta64'`` - * To select Pandas categorical dtypes, use ``'category'`` - * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in - 0.20.0) or ``'datetime64[ns, tz]'`` - - Examples - -------- - >>> df = pd.DataFrame({'a': [1, 2] * 3, - ... 'b': [True, False] * 3, - ... 'c': [1.0, 2.0] * 3}) - >>> df - a b c - 0 1 True 1.0 - 1 2 False 2.0 - 2 1 True 1.0 - 3 2 False 2.0 - 4 1 True 1.0 - 5 2 False 2.0 - - >>> df.select_dtypes(include='bool') - b - 0 True - 1 False - 2 True - 3 False - 4 True - 5 False - - >>> df.select_dtypes(include=['float64']) - c - 0 1.0 - 1 2.0 - 2 1.0 - 3 2.0 - 4 1.0 - 5 2.0 - - >>> df.select_dtypes(exclude=['int64']) - b c - 0 True 1.0 - 1 False 2.0 - 2 True 1.0 - 3 False 2.0 - 4 True 1.0 - 5 False 2.0 - """ - if not is_list_like(include): - include = (include,) if include is not None else () - if not is_list_like(exclude): - exclude = (exclude,) if exclude is not None else () - - selection = (frozenset(include), frozenset(exclude)) - - if not any(selection): - raise ValueError("at least one of include or exclude must be nonempty") - - # convert the myriad valid dtypes object to a single representation - def check_int_infer_dtype(dtypes): - converted_dtypes: list[type] = [] - for dtype in dtypes: - # Numpy maps int to different types (int32, in64) on Windows and Linux - # see https://github.com/numpy/numpy/issues/9464 - if (isinstance(dtype, str) and dtype == "int") or (dtype is int): - converted_dtypes.append(np.int32) - converted_dtypes.append(np.int64) - elif dtype == "float" or dtype is float: - # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20 - converted_dtypes.extend([np.float64, np.float32]) - else: - converted_dtypes.append(infer_dtype_from_object(dtype)) - return frozenset(converted_dtypes) - - include = check_int_infer_dtype(include) - exclude = check_int_infer_dtype(exclude) - - for dtypes in (include, exclude): - invalidate_string_dtypes(dtypes) - - # can't both include AND exclude! 
- if not include.isdisjoint(exclude): - raise ValueError(f"include and exclude overlap on {(include & exclude)}") - - def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: - return issubclass(dtype.type, tuple(dtypes_set)) or ( - np.number in dtypes_set and getattr(dtype, "_is_numeric", False) - ) - - def predicate(arr: ArrayLike) -> bool: - dtype = arr.dtype - if include: - if not dtype_predicate(dtype, include): - return False - - if exclude: - if dtype_predicate(dtype, exclude): - return False - - return True - - mgr = self._mgr._get_data_subset(predicate) - return type(self)(mgr).__finalize__(self) - - def insert( - self, - loc: int, - column: Hashable, - value: Scalar | AnyArrayLike, - allow_duplicates: bool | lib.NoDefault = lib.no_default, - ) -> None: - """ - Insert column into DataFrame at specified location. - - Raises a ValueError if `column` is already contained in the DataFrame, - unless `allow_duplicates` is set to True. - - Parameters - ---------- - loc : int - Insertion index. Must verify 0 <= loc <= len(columns). - column : str, number, or hashable object - Label of the inserted column. - value : Scalar, Series, or array-like - allow_duplicates : bool, optional, default lib.no_default - - See Also - -------- - Index.insert : Insert new item by index. - - Examples - -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) - >>> df - col1 col2 - 0 1 3 - 1 2 4 - >>> df.insert(1, "newcol", [99, 99]) - >>> df - col1 newcol col2 - 0 1 99 3 - 1 2 99 4 - >>> df.insert(0, "col1", [100, 100], allow_duplicates=True) - >>> df - col1 col1 newcol col2 - 0 100 1 99 3 - 1 100 2 99 4 - - Notice that pandas uses index alignment in case of `value` from type `Series`: - - >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2])) - >>> df - col0 col1 col1 newcol col2 - 0 NaN 100 1 99 3 - 1 5.0 100 2 99 4 - """ - if allow_duplicates is lib.no_default: - allow_duplicates = False - if allow_duplicates and not self.flags.allows_duplicate_labels: - raise ValueError( - "Cannot specify 'allow_duplicates=True' when " - "'self.flags.allows_duplicate_labels' is False." - ) - if not allow_duplicates and column in self.columns: - # Should this be a different kind of error?? - raise ValueError(f"cannot insert {column}, already exists") - if not isinstance(loc, int): - raise TypeError("loc must be int") - - value = self._sanitize_column(value) - self._mgr.insert(loc, column, value) - - def assign(self, **kwargs) -> DataFrame: - r""" - Assign new columns to a DataFrame. - - Returns a new object with all original columns in addition to new ones. - Existing columns that are re-assigned will be overwritten. - - Parameters - ---------- - **kwargs : dict of {str: callable or Series} - The column names are keywords. If the values are - callable, they are computed on the DataFrame and - assigned to the new columns. The callable must not - change input DataFrame (though pandas doesn't check it). - If the values are not callable, (e.g. a Series, scalar, or array), - they are simply assigned. - - Returns - ------- - DataFrame - A new DataFrame with the new columns in addition to - all the existing columns. - - Notes - ----- - Assigning multiple columns within the same ``assign`` is possible. - Later items in '\*\*kwargs' may refer to newly created or modified - columns in 'df'; items are computed and assigned into 'df' in order. - - Examples - -------- - >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, - ... 
index=['Portland', 'Berkeley']) - >>> df - temp_c - Portland 17.0 - Berkeley 25.0 - - Where the value is a callable, evaluated on `df`: - - >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) - temp_c temp_f - Portland 17.0 62.6 - Berkeley 25.0 77.0 - - Alternatively, the same behavior can be achieved by directly - referencing an existing Series or sequence: - - >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) - temp_c temp_f - Portland 17.0 62.6 - Berkeley 25.0 77.0 - - You can create multiple columns within the same assign where one - of the columns depends on another one defined within the same assign: - - >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, - ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) - temp_c temp_f temp_k - Portland 17.0 62.6 290.15 - Berkeley 25.0 77.0 298.15 - """ - data = self.copy() - - for k, v in kwargs.items(): - data[k] = com.apply_if_callable(v, data) - return data - - def _sanitize_column(self, value) -> ArrayLike: - """ - Ensures new columns (which go into the BlockManager as new blocks) are - always copied and converted into an array. - - Parameters - ---------- - value : scalar, Series, or array-like - - Returns - ------- - numpy.ndarray or ExtensionArray - """ - self._ensure_valid_index(value) - - # We should never get here with DataFrame value - if isinstance(value, Series): - return _reindex_for_setitem(value, self.index) - - if is_list_like(value): - com.require_length_match(value, self.index) - return sanitize_array(value, self.index, copy=True, allow_2d=True) - - @property - def _series(self): - return { - item: Series( - self._mgr.iget(idx), index=self.index, name=item, fastpath=True - ) - for idx, item in enumerate(self.columns) - } - - def lookup( - self, row_labels: Sequence[IndexLabel], col_labels: Sequence[IndexLabel] - ) -> np.ndarray: - """ - Label-based "fancy indexing" function for DataFrame. - Given equal-length arrays of row and column labels, return an - array of the values corresponding to each (row, col) pair. - - .. deprecated:: 1.2.0 - DataFrame.lookup is deprecated, - use pandas.factorize and NumPy indexing instead. - For further details see - :ref:`Looking up values by index/column labels `. - - Parameters - ---------- - row_labels : sequence - The row labels to use for lookup. - col_labels : sequence - The column labels to use for lookup. - - Returns - ------- - numpy.ndarray - The found values. - """ - msg = ( - "The 'lookup' method is deprecated and will be " - "removed in a future version. " - "You can use DataFrame.melt and DataFrame.loc " - "as a substitute." 
- ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) - - n = len(row_labels) - if n != len(col_labels): - raise ValueError("Row labels must have same size as column labels") - if not (self.index.is_unique and self.columns.is_unique): - # GH#33041 - raise ValueError("DataFrame.lookup requires unique index and columns") - - thresh = 1000 - if not self._is_mixed_type or n > thresh: - values = self.values - ridx = self.index.get_indexer(row_labels) - cidx = self.columns.get_indexer(col_labels) - if (ridx == -1).any(): - raise KeyError("One or more row labels was not found") - if (cidx == -1).any(): - raise KeyError("One or more column labels was not found") - flat_index = ridx * len(self.columns) + cidx - result = values.flat[flat_index] - else: - result = np.empty(n, dtype="O") - for i, (r, c) in enumerate(zip(row_labels, col_labels)): - result[i] = self._get_value(r, c) - - if is_object_dtype(result): - result = lib.maybe_convert_objects(result) - - return result - - # ---------------------------------------------------------------------- - # Reindexing and alignment - - def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): - frame = self - - columns = axes["columns"] - if columns is not None: - frame = frame._reindex_columns( - columns, method, copy, level, fill_value, limit, tolerance - ) - - index = axes["index"] - if index is not None: - frame = frame._reindex_index( - index, method, copy, level, fill_value, limit, tolerance - ) - - return frame - - def _reindex_index( - self, - new_index, - method, - copy: bool, - level: Level, - fill_value=np.nan, - limit=None, - tolerance=None, - ): - new_index, indexer = self.index.reindex( - new_index, method=method, level=level, limit=limit, tolerance=tolerance - ) - return self._reindex_with_indexers( - {0: [new_index, indexer]}, - copy=copy, - fill_value=fill_value, - allow_dups=False, - ) - - def _reindex_columns( - self, - new_columns, - method, - copy: bool, - level: Level, - fill_value=None, - limit=None, - tolerance=None, - ): - new_columns, indexer = self.columns.reindex( - new_columns, method=method, level=level, limit=limit, tolerance=tolerance - ) - return self._reindex_with_indexers( - {1: [new_columns, indexer]}, - copy=copy, - fill_value=fill_value, - allow_dups=False, - ) - - def _reindex_multi( - self, axes: dict[str, Index], copy: bool, fill_value - ) -> DataFrame: - """ - We are guaranteed non-Nones in the axes. - """ - - new_index, row_indexer = self.index.reindex(axes["index"]) - new_columns, col_indexer = self.columns.reindex(axes["columns"]) - - if row_indexer is not None and col_indexer is not None: - # Fastpath. By doing two 'take's at once we avoid making an - # unnecessary copy. - # We only get here with `not self._is_mixed_type`, which (almost) - # ensures that self.values is cheap. It may be worth making this - # condition more specific. 
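            # Sketch of the fastpath below (hypothetical, homogeneous frame):
            # self.values is then a single 2D ndarray, so
            #   take_2d_multi(self.values, (row_indexer, col_indexer), ...)
            # materializes the reindexed result in one shot instead of two
            # chained take operations along each axis.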
- indexer = row_indexer, col_indexer - new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) - return self._constructor(new_values, index=new_index, columns=new_columns) - else: - return self._reindex_with_indexers( - {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, - copy=copy, - fill_value=fill_value, - ) - - @doc(NDFrame.align, **_shared_doc_kwargs) - @validate_bool_kwargs_from_keywords('copy') - def align( - self, - other, - join: str = "outer", - axis: Axis | None = None, - level: Level | None = None, - copy: bool = True, - fill_value=None, - method: str | None = None, - limit=None, - fill_axis: Axis = 0, - broadcast_axis: Axis | None = None, - ) -> DataFrame: - return super().align( - other, - join=join, - axis=axis, - level=level, - copy=copy, - fill_value=fill_value, - method=method, - limit=limit, - fill_axis=fill_axis, - broadcast_axis=broadcast_axis, - ) - - @overload - def set_axis( - self, labels, axis: Axis = ..., inplace: Literal[False] = ... - ) -> DataFrame: - ... - - @overload - def set_axis(self, labels, axis: Axis, inplace: Literal[True]) -> None: - ... - - @overload - def set_axis(self, labels, *, inplace: Literal[True]) -> None: - ... - - @overload - def set_axis( - self, labels, axis: Axis = ..., inplace: bool = ... - ) -> DataFrame | None: - ... - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) - @Appender( - """ - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - - Change the row labels. - - >>> df.set_axis(['a', 'b', 'c'], axis='index') - A B - a 1 4 - b 2 5 - c 3 6 - - Change the column labels. - - >>> df.set_axis(['I', 'II'], axis='columns') - I II - 0 1 4 - 1 2 5 - 2 3 6 - - Now, update the labels inplace. - - >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) - >>> df - i ii - 0 1 4 - 1 2 5 - 2 3 6 - """ - ) - @Substitution( - **_shared_doc_kwargs, - extended_summary_sub=" column or", - axis_description_sub=", and 1 identifies the columns", - see_also_sub=" or columns", - ) - @Appender(NDFrame.set_axis.__doc__) - @validate_bool_kwargs_from_keywords('inplace') - def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): - return super().set_axis(labels, axis=axis, inplace=inplace) - - @Substitution(**_shared_doc_kwargs) - @Appender(NDFrame.reindex.__doc__) - @rewrite_axis_style_signature( - "labels", - [ - ("method", None), - ("copy", True), - ("level", None), - ("fill_value", np.nan), - ("limit", None), - ("tolerance", None), - ], - ) - def reindex(self, *args, **kwargs) -> DataFrame: - axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") - kwargs.update(axes) - # Pop these, since the values are in `kwargs` under different names - kwargs.pop("axis", None) - kwargs.pop("labels", None) - return super().reindex(**kwargs) - - @overload - def drop( - self, - labels: Hashable | list[Hashable] = ..., - *, - axis: Axis = ..., - index: Hashable | list[Hashable] = ..., - columns: Hashable | list[Hashable] = ..., - level: Level | None = ..., - inplace: Literal[True], - errors: IgnoreRaise = ..., - ) -> None: - ... - - @overload - def drop( - self, - labels: Hashable | list[Hashable] = ..., - *, - axis: Axis = ..., - index: Hashable | list[Hashable] = ..., - columns: Hashable | list[Hashable] = ..., - level: Level | None = ..., - inplace: Literal[False] = ..., - errors: IgnoreRaise = ..., - ) -> DataFrame: - ... 
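    # A minimal sketch of the validate_bool_kwargs_from_keywords decorator
    # that this patch applies throughout, assuming it generalizes
    # validate_bool_kwarg to several keywords (the real helper lives in
    # pandas.util._validators; this inline version is illustrative only):
    #
    #     def validate_bool_kwargs_from_keywords(*names):
    #         def decorator(func):
    #             @functools.wraps(func)
    #             def wrapper(*args, **kwargs):
    #                 for name in names:
    #                     if name in kwargs:
    #                         kwargs[name] = validate_bool_kwarg(kwargs[name], name)
    #                 return func(*args, **kwargs)
    #             return wrapper
    #         return decorator
    #
    # Each listed keyword is validated as a bool (or None) before the wrapped
    # method body runs; keywords passed positionally would bypass this check.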
- - @overload - def drop( - self, - labels: Hashable | list[Hashable] = ..., - *, - axis: Axis = ..., - index: Hashable | list[Hashable] = ..., - columns: Hashable | list[Hashable] = ..., - level: Level | None = ..., - inplace: bool = ..., - errors: IgnoreRaise = ..., - ) -> DataFrame | None: - ... - - # error: Signature of "drop" incompatible with supertype "NDFrame" - # github.com/python/mypy/issues/12387 - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) - @validate_bool_kwargs_from_keywords('inplace') - def drop( # type: ignore[override] - self, - labels: Hashable | list[Hashable] = None, - axis: Axis = 0, - index: Hashable | list[Hashable] = None, - columns: Hashable | list[Hashable] = None, - level: Level | None = None, - inplace: bool = False, - errors: IgnoreRaise = "raise", - ) -> DataFrame | None: - """ - Drop specified labels from rows or columns. - - Remove rows or columns by specifying label names and corresponding - axis, or by specifying directly index or column names. When using a - multi-index, labels on different levels can be removed by specifying - the level. See the `user guide ` - for more information about the now unused levels. - - Parameters - ---------- - labels : single label or list-like - Index or column labels to drop. A tuple will be used as a single - label and not treated as a list-like. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Whether to drop labels from the index (0 or 'index') or - columns (1 or 'columns'). - index : single label or list-like - Alternative to specifying axis (``labels, axis=0`` - is equivalent to ``index=labels``). - columns : single label or list-like - Alternative to specifying axis (``labels, axis=1`` - is equivalent to ``columns=labels``). - level : int or level name, optional - For MultiIndex, level from which the labels will be removed. - inplace : bool, default False - If False, return a copy. Otherwise, do operation - inplace and return None. - errors : {'ignore', 'raise'}, default 'raise' - If 'ignore', suppress error and only existing labels are - dropped. - - Returns - ------- - DataFrame or None - DataFrame without the removed index or column labels or - None if ``inplace=True``. - - Raises - ------ - KeyError - If any of the labels is not found in the selected axis. - - See Also - -------- - DataFrame.loc : Label-location based indexer for selection by label. - DataFrame.dropna : Return DataFrame with labels on given axis omitted - where (all or any) data are missing. - DataFrame.drop_duplicates : Return DataFrame with duplicate rows - removed, optionally only considering certain columns. - Series.drop : Return Series with specified index labels removed. - - Examples - -------- - >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), - ... columns=['A', 'B', 'C', 'D']) - >>> df - A B C D - 0 0 1 2 3 - 1 4 5 6 7 - 2 8 9 10 11 - - Drop columns - - >>> df.drop(['B', 'C'], axis=1) - A D - 0 0 3 - 1 4 7 - 2 8 11 - - >>> df.drop(columns=['B', 'C']) - A D - 0 0 3 - 1 4 7 - 2 8 11 - - Drop a row by index - - >>> df.drop([0, 1]) - A B C D - 2 8 9 10 11 - - Drop columns and/or rows of MultiIndex DataFrame - - >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], - ... ['speed', 'weight', 'length']], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], - ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], - ... [250, 150], [1.5, 0.8], [320, 250], - ... 
[1, 0.8], [0.3, 0.2]]) - >>> df - big small - lama speed 45.0 30.0 - weight 200.0 100.0 - length 1.5 1.0 - cow speed 30.0 20.0 - weight 250.0 150.0 - length 1.5 0.8 - falcon speed 320.0 250.0 - weight 1.0 0.8 - length 0.3 0.2 - - Drop a specific index combination from the MultiIndex - DataFrame, i.e., drop the combination ``'falcon'`` and - ``'weight'``, which deletes only the corresponding row - - >>> df.drop(index=('falcon', 'weight')) - big small - lama speed 45.0 30.0 - weight 200.0 100.0 - length 1.5 1.0 - cow speed 30.0 20.0 - weight 250.0 150.0 - length 1.5 0.8 - falcon speed 320.0 250.0 - length 0.3 0.2 - - >>> df.drop(index='cow', columns='small') - big - lama speed 45.0 - weight 200.0 - length 1.5 - falcon speed 320.0 - weight 1.0 - length 0.3 - - >>> df.drop(index='length', level=1) - big small - lama speed 45.0 30.0 - weight 200.0 100.0 - cow speed 30.0 20.0 - weight 250.0 150.0 - falcon speed 320.0 250.0 - weight 1.0 0.8 - """ - return super().drop( - labels=labels, - axis=axis, - index=index, - columns=columns, - level=level, - inplace=inplace, - errors=errors, - ) - - @overload - def rename( - self, - mapper: Renamer | None = ..., - *, - index: Renamer | None = ..., - columns: Renamer | None = ..., - axis: Axis | None = ..., - copy: bool = ..., - inplace: Literal[True], - level: Level | None = ..., - errors: IgnoreRaise = ..., - ) -> None: - ... - - @overload - def rename( - self, - mapper: Renamer | None = ..., - *, - index: Renamer | None = ..., - columns: Renamer | None = ..., - axis: Axis | None = ..., - copy: bool = ..., - inplace: Literal[False] = ..., - level: Level | None = ..., - errors: IgnoreRaise = ..., - ) -> DataFrame: - ... - - @overload - def rename( - self, - mapper: Renamer | None = ..., - *, - index: Renamer | None = ..., - columns: Renamer | None = ..., - axis: Axis | None = ..., - copy: bool = ..., - inplace: bool = ..., - level: Level | None = ..., - errors: IgnoreRaise = ..., - ) -> DataFrame | None: - ... - - @validate_bool_kwargs_from_keywords('inplace', 'copy') - def rename( - self, - mapper: Renamer | None = None, - *, - index: Renamer | None = None, - columns: Renamer | None = None, - axis: Axis | None = None, - copy: bool = True, - inplace: bool = False, - level: Level | None = None, - errors: IgnoreRaise = "ignore", - ) -> DataFrame | None: - """ - Alter axes labels. - - Function / dict values must be unique (1-to-1). Labels not contained in - a dict / Series will be left as-is. Extra labels listed don't throw an - error. - - See the :ref:`user guide ` for more. - - Parameters - ---------- - mapper : dict-like or function - Dict-like or function transformations to apply to - that axis' values. Use either ``mapper`` and ``axis`` to - specify the axis to target with ``mapper``, or ``index`` and - ``columns``. - index : dict-like or function - Alternative to specifying axis (``mapper, axis=0`` - is equivalent to ``index=mapper``). - columns : dict-like or function - Alternative to specifying axis (``mapper, axis=1`` - is equivalent to ``columns=mapper``). - axis : {0 or 'index', 1 or 'columns'}, default 0 - Axis to target with ``mapper``. Can be either the axis name - ('index', 'columns') or number (0, 1). The default is 'index'. - copy : bool, default True - Also copy underlying data. - inplace : bool, default False - Whether to return a new DataFrame. If True then value of copy is - ignored. - level : int or level name, default None - In case of a MultiIndex, only rename labels in the specified - level. 
- errors : {'ignore', 'raise'}, default 'ignore' - If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, - or `columns` contains labels that are not present in the Index - being transformed. - If 'ignore', existing keys will be renamed and extra keys will be - ignored. - - Returns - ------- - DataFrame or None - DataFrame with the renamed axis labels or None if ``inplace=True``. - - Raises - ------ - KeyError - If any of the labels is not found in the selected axis and - "errors='raise'". - - See Also - -------- - DataFrame.rename_axis : Set the name of the axis. - - Examples - -------- - ``DataFrame.rename`` supports two calling conventions - - * ``(index=index_mapper, columns=columns_mapper, ...)`` - * ``(mapper, axis={'index', 'columns'}, ...)`` - - We *highly* recommend using keyword arguments to clarify your - intent. - - Rename columns using a mapping: - - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - >>> df.rename(columns={"A": "a", "B": "c"}) - a c - 0 1 4 - 1 2 5 - 2 3 6 - - Rename index using a mapping: - - >>> df.rename(index={0: "x", 1: "y", 2: "z"}) - A B - x 1 4 - y 2 5 - z 3 6 - - Cast index labels to a different type: - - >>> df.index - RangeIndex(start=0, stop=3, step=1) - >>> df.rename(index=str).index - Index(['0', '1', '2'], dtype='object') - - >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") - Traceback (most recent call last): - KeyError: ['C'] not found in axis - - Using axis-style parameters: - - >>> df.rename(str.lower, axis='columns') - a b - 0 1 4 - 1 2 5 - 2 3 6 - - >>> df.rename({1: 2, 2: 4}, axis='index') - A B - 0 1 4 - 2 2 5 - 4 3 6 - """ - return super()._rename( - mapper=mapper, - index=index, - columns=columns, - axis=axis, - copy=copy, - inplace=inplace, - level=level, - errors=errors, - ) - - @overload - def fillna( - self, - value=..., - method: FillnaOptions | None = ..., - axis: Axis | None = ..., - inplace: Literal[False] = ..., - limit=..., - downcast=..., - ) -> DataFrame: - ... - - @overload - def fillna( - self, - value, - method: FillnaOptions | None, - axis: Axis | None, - inplace: Literal[True], - limit=..., - downcast=..., - ) -> None: - ... - - @overload - def fillna( - self, - *, - inplace: Literal[True], - limit=..., - downcast=..., - ) -> None: - ... - - @overload - def fillna( - self, - value, - *, - inplace: Literal[True], - limit=..., - downcast=..., - ) -> None: - ... - - @overload - def fillna( - self, - *, - method: FillnaOptions | None, - inplace: Literal[True], - limit=..., - downcast=..., - ) -> None: - ... - - @overload - def fillna( - self, - *, - axis: Axis | None, - inplace: Literal[True], - limit=..., - downcast=..., - ) -> None: - ... - - @overload - def fillna( - self, - *, - method: FillnaOptions | None, - axis: Axis | None, - inplace: Literal[True], - limit=..., - downcast=..., - ) -> None: - ... - - @overload - def fillna( - self, - value, - *, - axis: Axis | None, - inplace: Literal[True], - limit=..., - downcast=..., - ) -> None: - ... - - @overload - def fillna( - self, - value, - method: FillnaOptions | None, - *, - inplace: Literal[True], - limit=..., - downcast=..., - ) -> None: - ... - - @overload - def fillna( - self, - value=..., - method: FillnaOptions | None = ..., - axis: Axis | None = ..., - inplace: bool = ..., - limit=..., - downcast=..., - ) -> DataFrame | None: - ... 
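    # With the decorator applied to fillna below, a non-boolean ``inplace``
    # fails fast. Illustrative session (assuming validate_bool_kwarg's usual
    # error message):
    #
    #     >>> df.fillna(0, inplace="yes")  # doctest: +SKIP
    #     ValueError: For argument "inplace" expected type bool, received
    #     type str.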
- - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"]) - @doc(NDFrame.fillna, **_shared_doc_kwargs) - @validate_bool_kwargs_from_keywords('inplace') - def fillna( - self, - value: object | ArrayLike | None = None, - method: FillnaOptions | None = None, - axis: Axis | None = None, - inplace: bool = False, - limit=None, - downcast=None, - ) -> DataFrame | None: - return super().fillna( - value=value, - method=method, - axis=axis, - inplace=inplace, - limit=limit, - downcast=downcast, - ) - - def pop(self, item: Hashable) -> Series: - """ - Return item and drop from frame. Raise KeyError if not found. - - Parameters - ---------- - item : label - Label of column to be popped. - - Returns - ------- - Series - - Examples - -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan)], - ... columns=('name', 'class', 'max_speed')) - >>> df - name class max_speed - 0 falcon bird 389.0 - 1 parrot bird 24.0 - 2 lion mammal 80.5 - 3 monkey mammal NaN - - >>> df.pop('class') - 0 bird - 1 bird - 2 mammal - 3 mammal - Name: class, dtype: object - - >>> df - name max_speed - 0 falcon 389.0 - 1 parrot 24.0 - 2 lion 80.5 - 3 monkey NaN - """ - return super().pop(item=item) - - @validate_bool_kwargs_from_keywords('inplace') - @doc(NDFrame.replace, **_shared_doc_kwargs) - def replace( - self, - to_replace=None, - value=lib.no_default, - inplace: bool = False, - limit=None, - regex: bool = False, - method: str | lib.NoDefault = lib.no_default, - ): - return super().replace( - to_replace=to_replace, - value=value, - inplace=inplace, - limit=limit, - regex=regex, - method=method, - ) - - def _replace_columnwise( - self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex - ): - """ - Dispatch to Series.replace column-wise. 
- - Parameters - ---------- - mapping : dict - of the form {col: (target, value)} - inplace : bool - regex : bool or same types as `to_replace` in DataFrame.replace - - Returns - ------- - DataFrame or None - """ - # Operate column-wise - res = self if inplace else self.copy() - ax = self.columns - - for i in range(len(ax)): - if ax[i] in mapping: - ser = self.iloc[:, i] - - target, value = mapping[ax[i]] - newobj = ser.replace(target, value, regex=regex) - - res.iloc[:, i] = newobj - - if inplace: - return - return res.__finalize__(self) - - @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) - def shift( - self, - periods=1, - freq: Frequency | None = None, - axis: Axis = 0, - fill_value=lib.no_default, - ) -> DataFrame: - axis = self._get_axis_number(axis) - - ncols = len(self.columns) - if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: - # We will infer fill_value to match the closest column - - # Use a column that we know is valid for our column's dtype GH#38434 - label = self.columns[0] - - if periods > 0: - result = self.iloc[:, :-periods] - for col in range(min(ncols, abs(periods))): - # TODO(EA2D): doing this in a loop unnecessary with 2D EAs - # Define filler inside loop so we get a copy - filler = self.iloc[:, 0].shift(len(self)) - result.insert(0, label, filler, allow_duplicates=True) - else: - result = self.iloc[:, -periods:] - for col in range(min(ncols, abs(periods))): - # Define filler inside loop so we get a copy - filler = self.iloc[:, -1].shift(len(self)) - result.insert( - len(result.columns), label, filler, allow_duplicates=True - ) - - result.columns = self.columns.copy() - return result - elif ( - axis == 1 - and periods != 0 - and fill_value is not lib.no_default - and ncols > 0 - ): - arrays = self._mgr.arrays - if len(arrays) > 1 or ( - # If we only have one block and we know that we can't - # keep the same dtype (i.e. the _can_hold_element check) - # then we can go through the reindex_indexer path - # (and avoid casting logic in the Block method). - # The exception to this (until 2.0) is datetimelike - # dtypes with integers, which cast. - not can_hold_element(arrays[0], fill_value) - # TODO(2.0): remove special case for integer-with-datetimelike - # once deprecation is enforced - and not ( - lib.is_integer(fill_value) and needs_i8_conversion(arrays[0].dtype) - ) - ): - # GH#35488 we need to watch out for multi-block cases - # We only get here with fill_value not-lib.no_default - nper = abs(periods) - nper = min(nper, ncols) - if periods > 0: - indexer = np.array( - [-1] * nper + list(range(ncols - periods)), dtype=np.intp - ) - else: - indexer = np.array( - list(range(nper, ncols)) + [-1] * nper, dtype=np.intp - ) - mgr = self._mgr.reindex_indexer( - self.columns, - indexer, - axis=0, - fill_value=fill_value, - allow_dups=True, - ) - res_df = self._constructor(mgr) - return res_df.__finalize__(self, method="shift") - - return super().shift( - periods=periods, freq=freq, axis=axis, fill_value=fill_value - ) - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "keys"]) - @validate_bool_kwargs_from_keywords('inplace', 'drop', 'append', 'verify_integrity') - def set_index( - self, - keys, - drop: bool = True, - append: bool = False, - inplace: bool = False, - verify_integrity: bool = False, - ): - """ - Set the DataFrame index using existing columns. - - Set the DataFrame index (row labels) using one or more existing - columns or arrays (of the correct length). 
The index can replace the - existing index or expand on it. - - Parameters - ---------- - keys : label or array-like or list of labels/arrays - This parameter can be either a single column key, a single array of - the same length as the calling DataFrame, or a list containing an - arbitrary combination of column keys and arrays. Here, "array" - encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and - instances of :class:`~collections.abc.Iterator`. - drop : bool, default True - Delete columns to be used as the new index. - append : bool, default False - Whether to append columns to existing index. - inplace : bool, default False - If True, modifies the DataFrame in place (do not create a new object). - verify_integrity : bool, default False - Check the new index for duplicates. Otherwise defer the check until - necessary. Setting to False will improve the performance of this - method. - - Returns - ------- - DataFrame or None - Changed row labels or None if ``inplace=True``. - - See Also - -------- - DataFrame.reset_index : Opposite of set_index. - DataFrame.reindex : Change to new indices or expand indices. - DataFrame.reindex_like : Change to same indices as other DataFrame. - - Examples - -------- - >>> df = pd.DataFrame({'month': [1, 4, 7, 10], - ... 'year': [2012, 2014, 2013, 2014], - ... 'sale': [55, 40, 84, 31]}) - >>> df - month year sale - 0 1 2012 55 - 1 4 2014 40 - 2 7 2013 84 - 3 10 2014 31 - - Set the index to become the 'month' column: - - >>> df.set_index('month') - year sale - month - 1 2012 55 - 4 2014 40 - 7 2013 84 - 10 2014 31 - - Create a MultiIndex using columns 'year' and 'month': - - >>> df.set_index(['year', 'month']) - sale - year month - 2012 1 55 - 2014 4 40 - 2013 7 84 - 2014 10 31 - - Create a MultiIndex using an Index and a column: - - >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) - month sale - year - 1 2012 1 55 - 2 2014 4 40 - 3 2013 7 84 - 4 2014 10 31 - - Create a MultiIndex using two Series: - - >>> s = pd.Series([1, 2, 3, 4]) - >>> df.set_index([s, s**2]) - month year sale - 1 1 1 2012 55 - 2 4 4 2014 40 - 3 9 7 2013 84 - 4 16 10 2014 31 - """ - inplace = validate_bool_kwarg(inplace, "inplace") - self._check_inplace_and_allows_duplicate_labels(inplace) - if not isinstance(keys, list): - keys = [keys] - - err_msg = ( - 'The parameter "keys" may be a column key, one-dimensional ' - "array, or a list containing only valid column keys and " - "one-dimensional arrays." - ) - - missing: list[Hashable] = [] - for col in keys: - if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)): - # arrays are fine as long as they are one-dimensional - # iterators get converted to list below - if getattr(col, "ndim", 1) != 1: - raise ValueError(err_msg) - else: - # everything else gets tried as a key; see GH 24969 - try: - found = col in self.columns - except TypeError as err: - raise TypeError( - f"{err_msg}. 
Received column of type {type(col)}" - ) from err - else: - if not found: - missing.append(col) - - if missing: - raise KeyError(f"None of {missing} are in the columns") - - if inplace: - frame = self - else: - frame = self.copy() - - arrays = [] - names: list[Hashable] = [] - if append: - names = list(self.index.names) - if isinstance(self.index, MultiIndex): - for i in range(self.index.nlevels): - arrays.append(self.index._get_level_values(i)) - else: - arrays.append(self.index) - - to_remove: list[Hashable] = [] - for col in keys: - if isinstance(col, MultiIndex): - for n in range(col.nlevels): - arrays.append(col._get_level_values(n)) - names.extend(col.names) - elif isinstance(col, (Index, Series)): - # if Index then not MultiIndex (treated above) - - # error: Argument 1 to "append" of "list" has incompatible type - # "Union[Index, Series]"; expected "Index" - arrays.append(col) # type:ignore[arg-type] - names.append(col.name) - elif isinstance(col, (list, np.ndarray)): - # error: Argument 1 to "append" of "list" has incompatible type - # "Union[List[Any], ndarray]"; expected "Index" - arrays.append(col) # type: ignore[arg-type] - names.append(None) - elif isinstance(col, abc.Iterator): - # error: Argument 1 to "append" of "list" has incompatible type - # "List[Any]"; expected "Index" - arrays.append(list(col)) # type: ignore[arg-type] - names.append(None) - # from here, col can only be a column label - else: - arrays.append(frame[col]._values) - names.append(col) - if drop: - to_remove.append(col) - - if len(arrays[-1]) != len(self): - # check newest element against length of calling frame, since - # ensure_index_from_sequences would not raise for append=False. - raise ValueError( - f"Length mismatch: Expected {len(self)} rows, " - f"received array of length {len(arrays[-1])}" - ) - - index = ensure_index_from_sequences(arrays, names) - - if verify_integrity and not index.is_unique: - duplicates = index[index.duplicated()].unique() - raise ValueError(f"Index has duplicate keys: {duplicates}") - - # use set to handle duplicate column names gracefully in case of drop - for c in set(to_remove): - del frame[c] - - # clear up memory usage - index._cleanup() - - frame.index = index - - if not inplace: - return frame - - @overload - def reset_index( - self, - level: Hashable | Sequence[Hashable] | None = ..., - drop: bool = ..., - inplace: Literal[False] = ..., - col_level: Hashable = ..., - col_fill: Hashable = ..., - allow_duplicates: bool | lib.NoDefault = ..., - names: Hashable | Sequence[Hashable] = None, - ) -> DataFrame: - ... - - @overload - def reset_index( - self, - level: Hashable | Sequence[Hashable] | None, - drop: bool, - inplace: Literal[True], - col_level: Hashable = ..., - col_fill: Hashable = ..., - allow_duplicates: bool | lib.NoDefault = ..., - names: Hashable | Sequence[Hashable] = None, - ) -> None: - ... - - @overload - def reset_index( - self, - *, - drop: bool, - inplace: Literal[True], - col_level: Hashable = ..., - col_fill: Hashable = ..., - allow_duplicates: bool | lib.NoDefault = ..., - names: Hashable | Sequence[Hashable] = None, - ) -> None: - ... - - @overload - def reset_index( - self, - level: Hashable | Sequence[Hashable] | None, - *, - inplace: Literal[True], - col_level: Hashable = ..., - col_fill: Hashable = ..., - allow_duplicates: bool | lib.NoDefault = ..., - names: Hashable | Sequence[Hashable] = None, - ) -> None: - ... 
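    # These Literal[True]/Literal[False] overloads let a type checker narrow
    # reset_index's return type from the ``inplace`` argument, e.g. (df is a
    # hypothetical frame):
    #
    #     res = df.reset_index()                         # inferred: DataFrame
    #     out = df.reset_index(drop=True, inplace=True)  # inferred: None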
- - @overload - def reset_index( - self, - *, - inplace: Literal[True], - col_level: Hashable = ..., - col_fill: Hashable = ..., - allow_duplicates: bool | lib.NoDefault = ..., - names: Hashable | Sequence[Hashable] = None, - ) -> None: - ... - - @overload - def reset_index( - self, - level: Hashable | Sequence[Hashable] | None = ..., - drop: bool = ..., - inplace: bool = ..., - col_level: Hashable = ..., - col_fill: Hashable = ..., - allow_duplicates: bool | lib.NoDefault = ..., - names: Hashable | Sequence[Hashable] = None, - ) -> DataFrame | None: - ... - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) - @validate_bool_kwargs_from_keywords('inplace', 'drop') - def reset_index( - self, - level: Hashable | Sequence[Hashable] | None = None, - drop: bool = False, - inplace: bool = False, - col_level: Hashable = 0, - col_fill: Hashable = "", - allow_duplicates: bool | lib.NoDefault = lib.no_default, - names: Hashable | Sequence[Hashable] = None, - ) -> DataFrame | None: - """ - Reset the index, or a level of it. - - Reset the index of the DataFrame, and use the default one instead. - If the DataFrame has a MultiIndex, this method can remove one or more - levels. - - Parameters - ---------- - level : int, str, tuple, or list, default None - Only remove the given levels from the index. Removes all levels by - default. - drop : bool, default False - Do not try to insert index into dataframe columns. This resets - the index to the default integer index. - inplace : bool, default False - Modify the DataFrame in place (do not create a new object). - col_level : int or str, default 0 - If the columns have multiple levels, determines which level the - labels are inserted into. By default it is inserted into the first - level. - col_fill : object, default '' - If the columns have multiple levels, determines how the other - levels are named. If None then the index name is repeated. - allow_duplicates : bool, optional, default lib.no_default - Allow duplicate column labels to be created. - - .. versionadded:: 1.5.0 - - names : int, str or 1-dimensional list, default None - Using the given string, rename the DataFrame column which contains the - index data. If the DataFrame has a MultiIndex, this has to be a list or - tuple with length equal to the number of levels. - - .. versionadded:: 1.5.0 - - Returns - ------- - DataFrame or None - DataFrame with the new index or None if ``inplace=True``. - - See Also - -------- - DataFrame.set_index : Opposite of reset_index. - DataFrame.reindex : Change to new indices or expand indices. - DataFrame.reindex_like : Change to same indices as other DataFrame. - - Examples - -------- - >>> df = pd.DataFrame([('bird', 389.0), - ... ('bird', 24.0), - ... ('mammal', 80.5), - ... ('mammal', np.nan)], - ... index=['falcon', 'parrot', 'lion', 'monkey'], - ... columns=('class', 'max_speed')) - >>> df - class max_speed - falcon bird 389.0 - parrot bird 24.0 - lion mammal 80.5 - monkey mammal NaN - - When we reset the index, the old index is added as a column, and a - new sequential index is used: - - >>> df.reset_index() - index class max_speed - 0 falcon bird 389.0 - 1 parrot bird 24.0 - 2 lion mammal 80.5 - 3 monkey mammal NaN - - We can use the `drop` parameter to avoid the old index being added as - a column: - - >>> df.reset_index(drop=True) - class max_speed - 0 bird 389.0 - 1 bird 24.0 - 2 mammal 80.5 - 3 mammal NaN - - You can also use `reset_index` with `MultiIndex`. - - >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), - ... 
('bird', 'parrot'), - ... ('mammal', 'lion'), - ... ('mammal', 'monkey')], - ... names=['class', 'name']) - >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), - ... ('species', 'type')]) - >>> df = pd.DataFrame([(389.0, 'fly'), - ... ( 24.0, 'fly'), - ... ( 80.5, 'run'), - ... (np.nan, 'jump')], - ... index=index, - ... columns=columns) - >>> df - speed species - max type - class name - bird falcon 389.0 fly - parrot 24.0 fly - mammal lion 80.5 run - monkey NaN jump - - Using the `names` parameter, choose a name for the index column: - - >>> df.reset_index(names=['classes', 'names']) - classes names speed species - max type - 0 bird falcon 389.0 fly - 1 bird parrot 24.0 fly - 2 mammal lion 80.5 run - 3 mammal monkey NaN jump - - If the index has multiple levels, we can reset a subset of them: - - >>> df.reset_index(level='class') - class speed species - max type - name - falcon bird 389.0 fly - parrot bird 24.0 fly - lion mammal 80.5 run - monkey mammal NaN jump - - If we are not dropping the index, by default, it is placed in the top - level. We can place it in another level: - - >>> df.reset_index(level='class', col_level=1) - speed species - class max type - name - falcon bird 389.0 fly - parrot bird 24.0 fly - lion mammal 80.5 run - monkey mammal NaN jump - - When the index is inserted under another level, we can specify under - which one with the parameter `col_fill`: - - >>> df.reset_index(level='class', col_level=1, col_fill='species') - species speed species - class max type - name - falcon bird 389.0 fly - parrot bird 24.0 fly - lion mammal 80.5 run - monkey mammal NaN jump - - If we specify a nonexistent level for `col_fill`, it is created: - - >>> df.reset_index(level='class', col_level=1, col_fill='genus') - genus speed species - class max type - name - falcon bird 389.0 fly - parrot bird 24.0 fly - lion mammal 80.5 run - monkey mammal NaN jump - """ - inplace = validate_bool_kwarg(inplace, "inplace") - self._check_inplace_and_allows_duplicate_labels(inplace) - if inplace: - new_obj = self - else: - new_obj = self.copy() - if allow_duplicates is not lib.no_default: - allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") - - new_index = default_index(len(new_obj)) - if level is not None: - if not isinstance(level, (tuple, list)): - level = [level] - level = [self.index._get_level_number(lev) for lev in level] - if len(level) < self.index.nlevels: - new_index = self.index.droplevel(level) - - if not drop: - to_insert: Iterable[tuple[Any, Any | None]] - - default = "index" if "index" not in self else "level_0" - names = self.index._get_default_index_names(names, default) - - if isinstance(self.index, MultiIndex): - to_insert = zip(self.index.levels, self.index.codes) - else: - to_insert = ((self.index, None),) - - multi_col = isinstance(self.columns, MultiIndex) - for i, (lev, lab) in reversed(list(enumerate(to_insert))): - if level is not None and i not in level: - continue - name = names[i] - if multi_col: - col_name = list(name) if isinstance(name, tuple) else [name] - if col_fill is None: - if len(col_name) not in (1, self.columns.nlevels): - raise ValueError( - "col_fill=None is incompatible " - f"with incomplete column name {name}" - ) - col_fill = col_name[0] - - lev_num = self.columns._get_level_number(col_level) - name_lst = [col_fill] * lev_num + col_name - missing = self.columns.nlevels - len(name_lst) - name_lst += [col_fill] * missing - name = tuple(name_lst) - - # to ndarray and maybe infer different dtype - level_values = lev._values - 
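            # For a MultiIndex, ``lev`` holds only the unique values of one
            # level and ``lab`` holds the per-row integer codes into those
            # uniques.  The next two steps therefore (a) re-infer a sharper
            # dtype when the level is stored as object, and (b) expand the
            # uniques back out to one value per row, where ``allow_fill=True``
            # turns the -1 codes that mark missing entries into the level's
            # NA value.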
if level_values.dtype == np.object_: - level_values = lib.maybe_convert_objects(level_values) - - if lab is not None: - # if we have the codes, extract the values with a mask - level_values = algorithms.take( - level_values, lab, allow_fill=True, fill_value=lev._na_value - ) - - new_obj.insert( - 0, - name, - level_values, - allow_duplicates=allow_duplicates, - ) - - new_obj.index = new_index - if not inplace: - return new_obj - - return None - - # ---------------------------------------------------------------------- - # Reindex-based selection methods - - @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isna(self) -> DataFrame: - result = self._constructor(self._mgr.isna(func=isna)) - return result.__finalize__(self, method="isna") - - @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isnull(self) -> DataFrame: - """ - DataFrame.isnull is an alias for DataFrame.isna. - """ - return self.isna() - - @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notna(self) -> DataFrame: - return ~self.isna() - - @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notnull(self) -> DataFrame: - """ - DataFrame.notnull is an alias for DataFrame.notna. - """ - return ~self.isna() - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) - @validate_bool_kwargs_from_keywords('inplace') - def dropna( - self, - axis: Axis = 0, - how: str | NoDefault = no_default, - thresh: int | NoDefault = no_default, - subset: IndexLabel = None, - inplace: bool = False, - ): - """ - Remove missing values. - - See the :ref:`User Guide ` for more on which values are - considered missing, and how to work with missing data. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - Determine if rows or columns which contain missing values are - removed. - - * 0, or 'index' : Drop rows which contain missing values. - * 1, or 'columns' : Drop columns which contain missing value. - - .. versionchanged:: 1.0.0 - - Pass tuple or list to drop on multiple axes. - Only a single axis is allowed. - - how : {'any', 'all'}, default 'any' - Determine if row or column is removed from DataFrame, when we have - at least one NA or all NA. - - * 'any' : If any NA values are present, drop that row or column. - * 'all' : If all values are NA, drop that row or column. - - thresh : int, optional - Require that many non-NA values. Cannot be combined with how. - subset : column label or sequence of labels, optional - Labels along other axis to consider, e.g. if you are dropping rows - these would be a list of columns to include. - inplace : bool, default False - If True, do operation inplace and return None. - - Returns - ------- - DataFrame or None - DataFrame with NA entries dropped from it or None if ``inplace=True``. - - See Also - -------- - DataFrame.isna: Indicate missing values. - DataFrame.notna : Indicate existing (non-missing) values. - DataFrame.fillna : Replace missing values. - Series.dropna : Drop missing values. - Index.dropna : Drop missing indices. - - Examples - -------- - >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], - ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], - ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), - ... pd.NaT]}) - >>> df - name toy born - 0 Alfred NaN NaT - 1 Batman Batmobile 1940-04-25 - 2 Catwoman Bullwhip NaT - - Drop the rows where at least one element is missing. - - >>> df.dropna() - name toy born - 1 Batman Batmobile 1940-04-25 - - Drop the columns where at least one element is missing. 
- - >>> df.dropna(axis='columns') - name - 0 Alfred - 1 Batman - 2 Catwoman - - Drop the rows where all elements are missing. - - >>> df.dropna(how='all') - name toy born - 0 Alfred NaN NaT - 1 Batman Batmobile 1940-04-25 - 2 Catwoman Bullwhip NaT - - Keep only the rows with at least 2 non-NA values. - - >>> df.dropna(thresh=2) - name toy born - 1 Batman Batmobile 1940-04-25 - 2 Catwoman Bullwhip NaT - - Define in which columns to look for missing values. - - >>> df.dropna(subset=['name', 'toy']) - name toy born - 1 Batman Batmobile 1940-04-25 - 2 Catwoman Bullwhip NaT - - Keep the DataFrame with valid entries in the same variable. - - >>> df.dropna(inplace=True) - >>> df - name toy born - 1 Batman Batmobile 1940-04-25 - """ - if (how is not no_default) and (thresh is not no_default): - raise TypeError( - "You cannot set both the how and thresh arguments at the same time." - ) - - if how is no_default: - how = "any" - - inplace = validate_bool_kwarg(inplace, "inplace") - if isinstance(axis, (tuple, list)): - # GH20987 - raise TypeError("supplying multiple axes to axis is no longer supported.") - - axis = self._get_axis_number(axis) - agg_axis = 1 - axis - - agg_obj = self - if subset is not None: - # subset needs to be list - if not is_list_like(subset): - subset = [subset] - ax = self._get_axis(agg_axis) - indices = ax.get_indexer_for(subset) - check = indices == -1 - if check.any(): - raise KeyError(np.array(subset)[check].tolist()) - agg_obj = self.take(indices, axis=agg_axis) - - if thresh is not no_default: - count = agg_obj.count(axis=agg_axis) - mask = count >= thresh - elif how == "any": - # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]' - mask = notna(agg_obj).all(axis=agg_axis, bool_only=False) - elif how == "all": - # faster equivalent to 'agg_obj.count(agg_axis) > 0' - mask = notna(agg_obj).any(axis=agg_axis, bool_only=False) - else: - if how is not no_default: - raise ValueError(f"invalid how option: {how}") - - if np.all(mask): - result = self.copy() - else: - result = self.loc(axis=axis)[mask] - - if inplace: - self._update_inplace(result) - else: - return result - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"]) - @validate_bool_kwargs_from_keywords('inplace', 'ignore_index') - def drop_duplicates( - self, - subset: Hashable | Sequence[Hashable] | None = None, - keep: Literal["first"] | Literal["last"] | Literal[False] = "first", - inplace: bool = False, - ignore_index: bool = False, - ) -> DataFrame | None: - """ - Return DataFrame with duplicate rows removed. - - Considering certain columns is optional. Indexes, including time indexes - are ignored. - - Parameters - ---------- - subset : column label or sequence of labels, optional - Only consider certain columns for identifying duplicates, by - default use all of the columns. - keep : {'first', 'last', False}, default 'first' - Determines which duplicates (if any) to keep. - - ``first`` : Drop duplicates except for the first occurrence. - - ``last`` : Drop duplicates except for the last occurrence. - - False : Drop all duplicates. - inplace : bool, default False - Whether to drop duplicates in place or to return a copy. - ignore_index : bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. - - .. versionadded:: 1.0.0 - - Returns - ------- - DataFrame or None - DataFrame with duplicates removed or None if ``inplace=True``. - - See Also - -------- - DataFrame.value_counts: Count unique combinations of columns. 
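The helper this patch threads through these methods, ``validate_bool_kwargs_from_keywords``, is imported from ``pandas.util._validators`` but defined outside this hunk. Below is a minimal sketch of what such a decorator could look like, assuming it simply runs ``validate_bool_kwarg`` over the named keywords before the wrapped method executes; the body is an illustration of the idea, not the actual helper:

    import functools
    import inspect

    from pandas.util._validators import validate_bool_kwarg


    def validate_bool_kwargs_from_keywords(*names):
        def decorator(func):
            sig = inspect.signature(func)

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                bound = sig.bind(*args, **kwargs)
                bound.apply_defaults()
                for name in names:
                    # raises ValueError for non-boolean values, the same
                    # check the methods also perform inline
                    validate_bool_kwarg(bound.arguments[name], name)
                return func(*args, **kwargs)

            return wrapper

        return decorator

Note that the decorated methods still validate inline (for example ``inplace = validate_bool_kwarg(inplace, "inplace")`` in ``reset_index`` and ``dropna``), so under this reading the decorator adds an early, signature-level check rather than replacing the existing ones.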
- - Examples - -------- - Consider dataset containing ramen rating. - - >>> df = pd.DataFrame({ - ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], - ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], - ... 'rating': [4, 4, 3.5, 15, 5] - ... }) - >>> df - brand style rating - 0 Yum Yum cup 4.0 - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 - - By default, it removes duplicate rows based on all columns. - - >>> df.drop_duplicates() - brand style rating - 0 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 - - To remove duplicates on specific column(s), use ``subset``. - - >>> df.drop_duplicates(subset=['brand']) - brand style rating - 0 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - - To remove duplicates and keep last occurrences, use ``keep``. - - >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') - brand style rating - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 4 Indomie pack 5.0 - """ - if self.empty: - return self.copy() - - inplace = validate_bool_kwarg(inplace, "inplace") - ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") - duplicated = self.duplicated(subset, keep=keep) - - result = self[-duplicated] - if ignore_index: - result.index = default_index(len(result)) - - if inplace: - self._update_inplace(result) - return None - else: - return result - - def duplicated( - self, - subset: Hashable | Sequence[Hashable] | None = None, - keep: Literal["first"] | Literal["last"] | Literal[False] = "first", - ) -> Series: - """ - Return boolean Series denoting duplicate rows. - - Considering certain columns is optional. - - Parameters - ---------- - subset : column label or sequence of labels, optional - Only consider certain columns for identifying duplicates, by - default use all of the columns. - keep : {'first', 'last', False}, default 'first' - Determines which duplicates (if any) to mark. - - - ``first`` : Mark duplicates as ``True`` except for the first occurrence. - - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - - False : Mark all duplicates as ``True``. - - Returns - ------- - Series - Boolean series for each duplicated rows. - - See Also - -------- - Index.duplicated : Equivalent method on index. - Series.duplicated : Equivalent method on Series. - Series.drop_duplicates : Remove duplicate values from Series. - DataFrame.drop_duplicates : Remove duplicate values from DataFrame. - - Examples - -------- - Consider dataset containing ramen rating. - - >>> df = pd.DataFrame({ - ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], - ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], - ... 'rating': [4, 4, 3.5, 15, 5] - ... }) - >>> df - brand style rating - 0 Yum Yum cup 4.0 - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 - - By default, for each set of duplicated values, the first occurrence - is set on False and all others on True. - - >>> df.duplicated() - 0 False - 1 True - 2 False - 3 False - 4 False - dtype: bool - - By using 'last', the last occurrence of each set of duplicated values - is set on False and all others on True. - - >>> df.duplicated(keep='last') - 0 True - 1 False - 2 False - 3 False - 4 False - dtype: bool - - By setting ``keep`` on False, all duplicates are True. - - >>> df.duplicated(keep=False) - 0 True - 1 True - 2 False - 3 False - 4 False - dtype: bool - - To find duplicates on specific column(s), use ``subset``. 
- - >>> df.duplicated(subset=['brand']) - 0 False - 1 True - 2 False - 3 True - 4 True - dtype: bool - """ - - if self.empty: - return self._constructor_sliced(dtype=bool) - - def f(vals) -> tuple[np.ndarray, int]: - labels, shape = algorithms.factorize(vals, size_hint=len(self)) - return labels.astype("i8", copy=False), len(shape) - - if subset is None: - # https://github.com/pandas-dev/pandas/issues/28770 - # Incompatible types in assignment (expression has type "Index", variable - # has type "Sequence[Any]") - subset = self.columns # type: ignore[assignment] - elif ( - not np.iterable(subset) - or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self.columns - ): - subset = (subset,) - - # needed for mypy since can't narrow types using np.iterable - subset = cast(Sequence, subset) - - # Verify all columns in subset exist in the queried dataframe - # Otherwise, raise a KeyError, same as if you try to __getitem__ with a - # key that doesn't exist. - diff = set(subset) - set(self.columns) - if diff: - raise KeyError(Index(diff)) - - if len(subset) == 1 and self.columns.is_unique: - # GH#45236 This is faster than get_group_index below - result = self[subset[0]].duplicated(keep) - result.name = None - else: - vals = (col.values for name, col in self.items() if name in subset) - labels, shape = map(list, zip(*map(f, vals))) - - ids = get_group_index( - labels, - # error: Argument 1 to "tuple" has incompatible type "List[_T]"; - # expected "Iterable[int]" - tuple(shape), # type: ignore[arg-type] - sort=False, - xnull=False, - ) - result = self._constructor_sliced(duplicated(ids, keep), index=self.index) - return result.__finalize__(self, method="duplicated") - - # ---------------------------------------------------------------------- - # Sorting - # TODO: Just move the sort_values doc here. 
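For intuition, the multi-column path of ``duplicated`` can be paraphrased in a few lines: factorize each subset column into integer codes, fold the per-column codes into a single id per row (the same arithmetic as raveling a multi-dimensional index), then mark repeated ids. A rough sketch under those assumptions; the real method defers to ``get_group_index`` and a C-level ``duplicated`` instead, and additionally handles nulls and potential overflow of the combined ids:

    import numpy as np
    import pandas as pd


    def duplicated_sketch(df, subset, keep="first"):
        labels, sizes = [], []
        for col in subset:
            codes, uniques = pd.factorize(df[col])  # NaN becomes code -1
            labels.append(codes.astype("i8"))
            sizes.append(len(uniques))
        # fold per-column codes into one id per row
        ids = np.zeros(len(df), dtype="i8")
        for codes, size in zip(labels, sizes):
            ids = ids * size + codes
        return pd.Series(ids, index=df.index).duplicated(keep=keep)


    df = pd.DataFrame({"brand": ["Yum Yum", "Yum Yum", "Indomie"],
                       "style": ["cup", "cup", "pack"]})
    print(duplicated_sketch(df, ["brand", "style"]).tolist())
    # [False, True, False]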
- @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "by"]) - @Substitution(**_shared_doc_kwargs) - @Appender(NDFrame.sort_values.__doc__) - @validate_bool_kwargs_from_keywords('inplace', 'ignore_index') - # error: Signature of "sort_values" incompatible with supertype "NDFrame" - def sort_values( # type: ignore[override] - self, - by, - axis: Axis = 0, - ascending=True, - inplace: bool = False, - kind: str = "quicksort", - na_position: str = "last", - ignore_index: bool = False, - key: ValueKeyFunc = None, - ): - inplace = validate_bool_kwarg(inplace, "inplace") - axis = self._get_axis_number(axis) - ascending = validate_ascending(ascending) - if not isinstance(by, list): - by = [by] - if is_sequence(ascending) and len(by) != len(ascending): - raise ValueError( - f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" - ) - if len(by) > 1: - - keys = [self._get_label_or_level_values(x, axis=axis) for x in by] - - # need to rewrap columns in Series to apply key function - if key is not None: - # error: List comprehension has incompatible type List[Series]; - # expected List[ndarray] - keys = [ - Series(k, name=name) # type: ignore[misc] - for (k, name) in zip(keys, by) - ] - - indexer = lexsort_indexer( - keys, orders=ascending, na_position=na_position, key=key - ) - elif len(by): - # len(by) == 1 - - by = by[0] - k = self._get_label_or_level_values(by, axis=axis) - - # need to rewrap column in Series to apply key function - if key is not None: - # error: Incompatible types in assignment (expression has type - # "Series", variable has type "ndarray") - k = Series(k, name=by) # type: ignore[assignment] - - if isinstance(ascending, (tuple, list)): - ascending = ascending[0] - - indexer = nargsort( - k, kind=kind, ascending=ascending, na_position=na_position, key=key - ) - else: - return self.copy() - - new_data = self._mgr.take( - indexer, axis=self._get_block_manager_axis(axis), verify=False - ) - - if ignore_index: - new_data.set_axis( - self._get_block_manager_axis(axis), default_index(len(indexer)) - ) - - result = self._constructor(new_data) - if inplace: - return self._update_inplace(result) - else: - return result.__finalize__(self, method="sort_values") - - @overload - def sort_index( - self, - *, - axis: Axis = ..., - level: Level | None = ..., - ascending: bool | Sequence[bool] = ..., - inplace: Literal[True], - kind: SortKind = ..., - na_position: NaPosition = ..., - sort_remaining: bool = ..., - ignore_index: bool = ..., - key: IndexKeyFunc = ..., - ) -> None: - ... - - @overload - def sort_index( - self, - *, - axis: Axis = ..., - level: Level | None = ..., - ascending: bool | Sequence[bool] = ..., - inplace: Literal[False] = ..., - kind: SortKind = ..., - na_position: NaPosition = ..., - sort_remaining: bool = ..., - ignore_index: bool = ..., - key: IndexKeyFunc = ..., - ) -> DataFrame: - ... - - @overload - def sort_index( - self, - *, - axis: Axis = ..., - level: Level | None = ..., - ascending: bool | Sequence[bool] = ..., - inplace: bool = ..., - kind: SortKind = ..., - na_position: NaPosition = ..., - sort_remaining: bool = ..., - ignore_index: bool = ..., - key: IndexKeyFunc = ..., - ) -> DataFrame | None: - ... 
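The stacked ``sort_index`` overloads above encode the usual pandas convention: ``inplace=True`` maps to a ``None`` return, ``inplace=False`` to a new frame, and a plain ``bool`` to the union of the two. A stripped-down illustration of the pattern (a generic example, not pandas code):

    from __future__ import annotations

    from typing import Literal, overload


    class Frame:
        @overload
        def sort(self, *, inplace: Literal[True]) -> None:
            ...

        @overload
        def sort(self, *, inplace: Literal[False] = ...) -> Frame:
            ...

        @overload
        def sort(self, *, inplace: bool = ...) -> Frame | None:
            ...

        def sort(self, *, inplace: bool = False) -> Frame | None:
            # only this implementation runs; the overloads exist purely for
            # type checkers, which pick the first matching signature
            return None if inplace else Frame()


    frame = Frame().sort()                # checkers infer Frame
    nothing = Frame().sort(inplace=True)  # checkers infer None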
- - # error: Signature of "sort_index" incompatible with supertype "NDFrame" - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) - @validate_bool_kwargs_from_keywords('inplace', 'sort_remaining', 'ignore_index') - def sort_index( # type: ignore[override] - self, - axis: Axis = 0, - level: Level | None = None, - ascending: bool | Sequence[bool] = True, - inplace: bool = False, - kind: SortKind = "quicksort", - na_position: NaPosition = "last", - sort_remaining: bool = True, - ignore_index: bool = False, - key: IndexKeyFunc = None, - ) -> DataFrame | None: - """ - Sort object by labels (along an axis). - - Returns a new DataFrame sorted by label if `inplace` argument is - ``False``, otherwise updates the original DataFrame and returns None. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis along which to sort. The value 0 identifies the rows, - and 1 identifies the columns. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - ascending : bool or list-like of bools, default True - Sort ascending vs. descending. When the index is a MultiIndex the - sort direction can be controlled for each level individually. - inplace : bool, default False - If True, perform operation in-place. - kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' - Choice of sorting algorithm. See also :func:`numpy.sort` for more - information. `mergesort` and `stable` are the only stable algorithms. For - DataFrames, this option is only applied when sorting on a single - column or label. - na_position : {'first', 'last'}, default 'last' - Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. - Not implemented for MultiIndex. - sort_remaining : bool, default True - If True and sorting by level and index is multilevel, sort by other - levels too (in order) after sorting by specified level. - ignore_index : bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. - - .. versionadded:: 1.0.0 - - key : callable, optional - If not None, apply the key function to the index values - before sorting. This is similar to the `key` argument in the - builtin :meth:`sorted` function, with the notable difference that - this `key` function should be *vectorized*. It should expect an - ``Index`` and return an ``Index`` of the same shape. For MultiIndex - inputs, the key is applied *per level*. - - .. versionadded:: 1.1.0 - - Returns - ------- - DataFrame or None - The original DataFrame sorted by the labels or None if ``inplace=True``. - - See Also - -------- - Series.sort_index : Sort Series by the index. - DataFrame.sort_values : Sort DataFrame by the value. - Series.sort_values : Sort Series by the value. - - Examples - -------- - >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], - ... columns=['A']) - >>> df.sort_index() - A - 1 4 - 29 2 - 100 1 - 150 5 - 234 3 - - By default, it sorts in ascending order, to sort in descending order, - use ``ascending=False`` - - >>> df.sort_index(ascending=False) - A - 234 3 - 150 5 - 100 1 - 29 2 - 1 4 - - A key function can be specified which is applied to the index before - sorting. For a ``MultiIndex`` this is applied to each level separately. 
- - >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) - >>> df.sort_index(key=lambda x: x.str.lower()) - a - A 1 - b 2 - C 3 - d 4 - """ - return super().sort_index( - axis=axis, - level=level, - ascending=ascending, - inplace=inplace, - kind=kind, - na_position=na_position, - sort_remaining=sort_remaining, - ignore_index=ignore_index, - key=key, - ) - - @validate_bool_kwargs_from_keywords('normalize', 'sort', 'ascending', 'dropna') - def value_counts( - self, - subset: Sequence[Hashable] | None = None, - normalize: bool = False, - sort: bool = True, - ascending: bool = False, - dropna: bool = True, - ): - """ - Return a Series containing counts of unique rows in the DataFrame. - - .. versionadded:: 1.1.0 - - Parameters - ---------- - subset : list-like, optional - Columns to use when counting unique combinations. - normalize : bool, default False - Return proportions rather than frequencies. - sort : bool, default True - Sort by frequencies. - ascending : bool, default False - Sort in ascending order. - dropna : bool, default True - Don’t include counts of rows that contain NA values. - - .. versionadded:: 1.3.0 - - Returns - ------- - Series - - See Also - -------- - Series.value_counts: Equivalent method on Series. - - Notes - ----- - The returned Series will have a MultiIndex with one level per input - column. By default, rows that contain any NA values are omitted from - the result. By default, the resulting Series will be in descending - order so that the first element is the most frequently-occurring row. - - Examples - -------- - >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], - ... 'num_wings': [2, 0, 0, 0]}, - ... index=['falcon', 'dog', 'cat', 'ant']) - >>> df - num_legs num_wings - falcon 2 2 - dog 4 0 - cat 4 0 - ant 6 0 - - >>> df.value_counts() - num_legs num_wings - 4 0 2 - 2 2 1 - 6 0 1 - dtype: int64 - - >>> df.value_counts(sort=False) - num_legs num_wings - 2 2 1 - 4 0 2 - 6 0 1 - dtype: int64 - - >>> df.value_counts(ascending=True) - num_legs num_wings - 2 2 1 - 6 0 1 - 4 0 2 - dtype: int64 - - >>> df.value_counts(normalize=True) - num_legs num_wings - 4 0 0.50 - 2 2 0.25 - 6 0 0.25 - dtype: float64 - - With `dropna` set to `False` we can also count rows with NA values. - - >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], - ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) - >>> df - first_name middle_name - 0 John Smith - 1 Anne - 2 John - 3 Beth Louise - - >>> df.value_counts() - first_name middle_name - Beth Louise 1 - John Smith 1 - dtype: int64 - - >>> df.value_counts(dropna=False) - first_name middle_name - Anne NaN 1 - Beth Louise 1 - John Smith 1 - NaN 1 - dtype: int64 - """ - if subset is None: - subset = self.columns.tolist() - - counts = self.groupby(subset, dropna=dropna).grouper.size() - - if sort: - counts = counts.sort_values(ascending=ascending) - if normalize: - counts /= counts.sum() - - # Force MultiIndex for single column - if len(subset) == 1: - counts.index = MultiIndex.from_arrays( - [counts.index], names=[counts.index.name] - ) - - return counts - - def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: - """ - Return the first `n` rows ordered by `columns` in descending order. - - Return the first `n` rows with the largest values in `columns`, in - descending order. The columns that are not specified are returned as - well, but not used for ordering. - - This method is equivalent to - ``df.sort_values(columns, ascending=False).head(n)``, but more - performant. 
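A quick way to convince yourself of the equivalence claimed above (an illustrative check on a small frame; the speed advantage comes from ``nlargest`` selecting via a kth-value partition rather than fully sorting the column):

    import pandas as pd

    df = pd.DataFrame(
        {"population": [59_000_000, 65_000_000, 434_000],
         "GDP": [1_937_894, 2_583_560, 12_011]},
        index=["Italy", "France", "Malta"],
    )

    top = df.nlargest(2, "population")
    via_sort = df.sort_values("population", ascending=False).head(2)
    assert top.equals(via_sort)  # same rows, same order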
- - Parameters - ---------- - n : int - Number of rows to return. - columns : label or list of labels - Column label(s) to order by. - keep : {'first', 'last', 'all'}, default 'first' - Where there are duplicate values: - - - ``first`` : prioritize the first occurrence(s) - - ``last`` : prioritize the last occurrence(s) - - ``all`` : do not drop any duplicates, even it means - selecting more than `n` items. - - Returns - ------- - DataFrame - The first `n` rows ordered by the given columns in descending - order. - - See Also - -------- - DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in - ascending order. - DataFrame.sort_values : Sort DataFrame by the values. - DataFrame.head : Return the first `n` rows without re-ordering. - - Notes - ----- - This function cannot be used with all column types. For example, when - specifying columns with `object` or `category` dtypes, ``TypeError`` is - raised. - - Examples - -------- - >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, - ... 434000, 434000, 337000, 11300, - ... 11300, 11300], - ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, - ... 17036, 182, 38, 311], - ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", - ... "IS", "NR", "TV", "AI"]}, - ... index=["Italy", "France", "Malta", - ... "Maldives", "Brunei", "Iceland", - ... "Nauru", "Tuvalu", "Anguilla"]) - >>> df - population GDP alpha-2 - Italy 59000000 1937894 IT - France 65000000 2583560 FR - Malta 434000 12011 MT - Maldives 434000 4520 MV - Brunei 434000 12128 BN - Iceland 337000 17036 IS - Nauru 11300 182 NR - Tuvalu 11300 38 TV - Anguilla 11300 311 AI - - In the following example, we will use ``nlargest`` to select the three - rows having the largest values in column "population". - - >>> df.nlargest(3, 'population') - population GDP alpha-2 - France 65000000 2583560 FR - Italy 59000000 1937894 IT - Malta 434000 12011 MT - - When using ``keep='last'``, ties are resolved in reverse order: - - >>> df.nlargest(3, 'population', keep='last') - population GDP alpha-2 - France 65000000 2583560 FR - Italy 59000000 1937894 IT - Brunei 434000 12128 BN - - When using ``keep='all'``, all duplicate items are maintained: - - >>> df.nlargest(3, 'population', keep='all') - population GDP alpha-2 - France 65000000 2583560 FR - Italy 59000000 1937894 IT - Malta 434000 12011 MT - Maldives 434000 4520 MV - Brunei 434000 12128 BN - - To order by the largest values in column "population" and then "GDP", - we can specify multiple columns like in the next example. - - >>> df.nlargest(3, ['population', 'GDP']) - population GDP alpha-2 - France 65000000 2583560 FR - Italy 59000000 1937894 IT - Brunei 434000 12128 BN - """ - return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - - def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: - """ - Return the first `n` rows ordered by `columns` in ascending order. - - Return the first `n` rows with the smallest values in `columns`, in - ascending order. The columns that are not specified are returned as - well, but not used for ordering. - - This method is equivalent to - ``df.sort_values(columns, ascending=True).head(n)``, but more - performant. - - Parameters - ---------- - n : int - Number of items to retrieve. - columns : list or str - Column name or names to order by. - keep : {'first', 'last', 'all'}, default 'first' - Where there are duplicate values: - - - ``first`` : take the first occurrence. - - ``last`` : take the last occurrence. 
- - ``all`` : do not drop any duplicates, even it means - selecting more than `n` items. - - Returns - ------- - DataFrame - - See Also - -------- - DataFrame.nlargest : Return the first `n` rows ordered by `columns` in - descending order. - DataFrame.sort_values : Sort DataFrame by the values. - DataFrame.head : Return the first `n` rows without re-ordering. - - Examples - -------- - >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, - ... 434000, 434000, 337000, 337000, - ... 11300, 11300], - ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, - ... 17036, 182, 38, 311], - ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", - ... "IS", "NR", "TV", "AI"]}, - ... index=["Italy", "France", "Malta", - ... "Maldives", "Brunei", "Iceland", - ... "Nauru", "Tuvalu", "Anguilla"]) - >>> df - population GDP alpha-2 - Italy 59000000 1937894 IT - France 65000000 2583560 FR - Malta 434000 12011 MT - Maldives 434000 4520 MV - Brunei 434000 12128 BN - Iceland 337000 17036 IS - Nauru 337000 182 NR - Tuvalu 11300 38 TV - Anguilla 11300 311 AI - - In the following example, we will use ``nsmallest`` to select the - three rows having the smallest values in column "population". - - >>> df.nsmallest(3, 'population') - population GDP alpha-2 - Tuvalu 11300 38 TV - Anguilla 11300 311 AI - Iceland 337000 17036 IS - - When using ``keep='last'``, ties are resolved in reverse order: - - >>> df.nsmallest(3, 'population', keep='last') - population GDP alpha-2 - Anguilla 11300 311 AI - Tuvalu 11300 38 TV - Nauru 337000 182 NR - - When using ``keep='all'``, all duplicate items are maintained: - - >>> df.nsmallest(3, 'population', keep='all') - population GDP alpha-2 - Tuvalu 11300 38 TV - Anguilla 11300 311 AI - Iceland 337000 17036 IS - Nauru 337000 182 NR - - To order by the smallest values in column "population" and then "GDP", we can - specify multiple columns like in the next example. - - >>> df.nsmallest(3, ['population', 'GDP']) - population GDP alpha-2 - Tuvalu 11300 38 TV - Anguilla 11300 311 AI - Nauru 337000 182 NR - """ - return algorithms.SelectNFrame( - self, n=n, keep=keep, columns=columns - ).nsmallest() - - @doc( - Series.swaplevel, - klass=_shared_doc_kwargs["klass"], - extra_params=dedent( - """axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to swap levels on. 0 or 'index' for row-wise, 1 or - 'columns' for column-wise.""" - ), - examples=dedent( - """\ - Examples - -------- - >>> df = pd.DataFrame( - ... {"Grade": ["A", "B", "A", "C"]}, - ... index=[ - ... ["Final exam", "Final exam", "Coursework", "Coursework"], - ... ["History", "Geography", "History", "Geography"], - ... ["January", "February", "March", "April"], - ... ], - ... ) - >>> df - Grade - Final exam History January A - Geography February B - Coursework History March A - Geography April C - - In the following example, we will swap the levels of the indices. - Here, we will swap the levels column-wise, but levels can be swapped row-wise - in a similar manner. Note that column-wise is the default behaviour. - By not supplying any arguments for i and j, we swap the last and second to - last indices. - - >>> df.swaplevel() - Grade - Final exam January History A - February Geography B - Coursework March History A - April Geography C - - By supplying one argument, we can choose which index to swap the last - index with. We can for example swap the first index with the last one as - follows. 
- - >>> df.swaplevel(0) - Grade - January History Final exam A - February Geography Final exam B - March History Coursework A - April Geography Coursework C - - We can also define explicitly which indices we want to swap by supplying values - for both i and j. Here, we for example swap the first and second indices. - - >>> df.swaplevel(0, 1) - Grade - History Final exam January A - Geography Final exam February B - History Coursework March A - Geography Coursework April C""" - ), - ) - def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: - result = self.copy() - - axis = self._get_axis_number(axis) - - if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover - raise TypeError("Can only swap levels on a hierarchical axis.") - - if axis == 0: - assert isinstance(result.index, MultiIndex) - result.index = result.index.swaplevel(i, j) - else: - assert isinstance(result.columns, MultiIndex) - result.columns = result.columns.swaplevel(i, j) - return result - - def reorder_levels(self, order: Sequence[Axis], axis: Axis = 0) -> DataFrame: - """ - Rearrange index levels using input order. May not drop or duplicate levels. - - Parameters - ---------- - order : list of int or list of str - List representing new level order. Reference level by number - (position) or by key (label). - axis : {0 or 'index', 1 or 'columns'}, default 0 - Where to reorder levels. - - Returns - ------- - DataFrame - - Examples - -------- - >>> data = { - ... "class": ["Mammals", "Mammals", "Reptiles"], - ... "diet": ["Omnivore", "Carnivore", "Carnivore"], - ... "species": ["Humans", "Dogs", "Snakes"], - ... } - >>> df = pd.DataFrame(data, columns=["class", "diet", "species"]) - >>> df = df.set_index(["class", "diet"]) - >>> df - species - class diet - Mammals Omnivore Humans - Carnivore Dogs - Reptiles Carnivore Snakes - - Let's reorder the levels of the index: - - >>> df.reorder_levels(["diet", "class"]) - species - diet class - Omnivore Mammals Humans - Carnivore Mammals Dogs - Reptiles Snakes - """ - axis = self._get_axis_number(axis) - if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover - raise TypeError("Can only reorder levels on a hierarchical axis.") - - result = self.copy() - - if axis == 0: - assert isinstance(result.index, MultiIndex) - result.index = result.index.reorder_levels(order) - else: - assert isinstance(result.columns, MultiIndex) - result.columns = result.columns.reorder_levels(order) - return result - - # ---------------------------------------------------------------------- - # Arithmetic Methods - - def _cmp_method(self, other, op): - axis = 1 # only relevant for Series other case - - self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None) - - # See GH#4537 for discussion of scalar op behavior - new_data = self._dispatch_frame_op(other, op, axis=axis) - return self._construct_result(new_data) - - def _arith_method(self, other, op): - if ops.should_reindex_frame_op(self, other, op, 1, 1, None, None): - return ops.frame_arith_method_with_reindex(self, other, op) - - axis = 1 # only relevant for Series other case - other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],)) - - self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None) - - new_data = self._dispatch_frame_op(other, op, axis=axis) - return self._construct_result(new_data) - - _logical_method = _arith_method - - def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): - """ - Evaluate the frame operation 
func(left, right) by evaluating - column-by-column, dispatching to the Series implementation. - - Parameters - ---------- - right : scalar, Series, or DataFrame - func : arithmetic or comparison operator - axis : {None, 0, 1} - - Returns - ------- - DataFrame - """ - # Get the appropriate array-op to apply to each column/block's values. - array_op = ops.get_array_op(func) - - right = lib.item_from_zerodim(right) - if not is_list_like(right): - # i.e. scalar, faster than checking np.ndim(right) == 0 - with np.errstate(all="ignore"): - bm = self._mgr.apply(array_op, right=right) - return self._constructor(bm) - - elif isinstance(right, DataFrame): - assert self.index.equals(right.index) - assert self.columns.equals(right.columns) - # TODO: The previous assertion `assert right._indexed_same(self)` - # fails in cases with empty columns reached via - # _frame_arith_method_with_reindex - - # TODO operate_blockwise expects a manager of the same type - with np.errstate(all="ignore"): - bm = self._mgr.operate_blockwise( - # error: Argument 1 to "operate_blockwise" of "ArrayManager" has - # incompatible type "Union[ArrayManager, BlockManager]"; expected - # "ArrayManager" - # error: Argument 1 to "operate_blockwise" of "BlockManager" has - # incompatible type "Union[ArrayManager, BlockManager]"; expected - # "BlockManager" - right._mgr, # type: ignore[arg-type] - array_op, - ) - return self._constructor(bm) - - elif isinstance(right, Series) and axis == 1: - # axis=1 means we want to operate row-by-row - assert right.index.equals(self.columns) - - right = right._values - # maybe_align_as_frame ensures we do not have an ndarray here - assert not isinstance(right, np.ndarray) - - with np.errstate(all="ignore"): - arrays = [ - array_op(_left, _right) - for _left, _right in zip(self._iter_column_arrays(), right) - ] - - elif isinstance(right, Series): - assert right.index.equals(self.index) # Handle other cases later - right = right._values - - with np.errstate(all="ignore"): - arrays = [array_op(left, right) for left in self._iter_column_arrays()] - - else: - # Remaining cases have less-obvious dispatch rules - raise NotImplementedError(right) - - return type(self)._from_arrays( - arrays, self.columns, self.index, verify_integrity=False - ) - - def _combine_frame(self, other: DataFrame, func, fill_value=None): - # at this point we have `self._indexed_same(other)` - - if fill_value is None: - # since _arith_op may be called in a loop, avoid function call - # overhead if possible by doing this check once - _arith_op = func - - else: - - def _arith_op(left, right): - # for the mixed_type case where we iterate over columns, - # _arith_op(left, right) is equivalent to - # left._binop(right, func, fill_value=fill_value) - left, right = ops.fill_binop(left, right, fill_value) - return func(left, right) - - new_data = self._dispatch_frame_op(other, _arith_op) - return new_data - - def _construct_result(self, result) -> DataFrame: - """ - Wrap the result of an arithmetic, comparison, or logical operation. 
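Seen from the outside, the dispatch above reduces to three shapes of ``right``: a scalar applies blockwise, a Series broadcasts along whichever axis the caller aligned it on, and a DataFrame operates blockwise after alignment. An illustrative view through the public flex methods, which route into this dispatch:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [10, 20]})

    # scalar: the ``not is_list_like(right)`` branch, applied block by block
    print(df * 2)

    # Series aligned on the columns (axis=1): evaluated one column array
    # at a time against the matching element of the Series
    print(df.sub(pd.Series({"a": 1, "b": 100}), axis=1))

    # Series aligned on the index (axis=0): the remaining Series branch
    print(df.sub(pd.Series([1, 2]), axis=0))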
- - Parameters - ---------- - result : DataFrame - - Returns - ------- - DataFrame - """ - out = self._constructor(result, copy=False) - # Pin columns instead of passing to constructor for compat with - # non-unique columns case - out.columns = self.columns - out.index = self.index - return out - - def __divmod__(self, other) -> tuple[DataFrame, DataFrame]: - # Naive implementation, room for optimization - div = self // other - mod = self - div * other - return div, mod - - def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: - # Naive implementation, room for optimization - div = other // self - mod = other - div * self - return div, mod - - # ---------------------------------------------------------------------- - # Combination-Related - - @doc( - _shared_docs["compare"], - """ -Returns -------- -DataFrame - DataFrame that shows the differences stacked side by side. - - The resulting index will be a MultiIndex with 'self' and 'other' - stacked alternately at the inner level. - -Raises ------- -ValueError - When the two DataFrames don't have identical labels or shape. - -See Also --------- -Series.compare : Compare with another Series and show differences. -DataFrame.equals : Test whether two objects contain the same elements. - -Notes ------ -Matching NaNs will not appear as a difference. - -Can only compare identically-labeled -(i.e. same shape, identical row and column labels) DataFrames - -Examples --------- ->>> df = pd.DataFrame( -... {{ -... "col1": ["a", "a", "b", "b", "a"], -... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], -... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] -... }}, -... columns=["col1", "col2", "col3"], -... ) ->>> df - col1 col2 col3 -0 a 1.0 1.0 -1 a 2.0 2.0 -2 b 3.0 3.0 -3 b NaN 4.0 -4 a 5.0 5.0 - ->>> df2 = df.copy() ->>> df2.loc[0, 'col1'] = 'c' ->>> df2.loc[2, 'col3'] = 4.0 ->>> df2 - col1 col2 col3 -0 c 1.0 1.0 -1 a 2.0 2.0 -2 b 3.0 4.0 -3 b NaN 4.0 -4 a 5.0 5.0 - -Align the differences on columns - ->>> df.compare(df2) - col1 col3 - self other self other -0 a c NaN NaN -2 NaN NaN 3.0 4.0 - -Stack the differences on rows - ->>> df.compare(df2, align_axis=0) - col1 col3 -0 self a NaN - other c NaN -2 self NaN 3.0 - other NaN 4.0 - -Keep the equal values - ->>> df.compare(df2, keep_equal=True) - col1 col3 - self other self other -0 a c 1.0 1.0 -2 b b 3.0 4.0 - -Keep all original rows and columns - ->>> df.compare(df2, keep_shape=True) - col1 col2 col3 - self other self other self other -0 a c NaN NaN NaN NaN -1 NaN NaN NaN NaN NaN NaN -2 NaN NaN NaN NaN 3.0 4.0 -3 NaN NaN NaN NaN NaN NaN -4 NaN NaN NaN NaN NaN NaN - -Keep all original rows and columns and also all original values - ->>> df.compare(df2, keep_shape=True, keep_equal=True) - col1 col2 col3 - self other self other self other -0 a c 1.0 1.0 1.0 1.0 -1 a a 2.0 2.0 2.0 2.0 -2 b b 3.0 3.0 3.0 4.0 -3 b b NaN NaN 4.0 4.0 -4 a a 5.0 5.0 5.0 5.0 -""", - klass=_shared_doc_kwargs["klass"], - ) - @validate_bool_kwargs_from_keywords('keep_shape', 'keep_equal') - def compare( - self, - other: DataFrame, - align_axis: Axis = 1, - keep_shape: bool = False, - keep_equal: bool = False, - ) -> DataFrame: - return super().compare( - other=other, - align_axis=align_axis, - keep_shape=keep_shape, - keep_equal=keep_equal, - ) - - @validate_bool_kwargs_from_keywords('overwrite') - def combine( - self, other: DataFrame, func, fill_value=None, overwrite: bool = True - ) -> DataFrame: - """ - Perform column-wise combine with another DataFrame. 
- - Combines a DataFrame with `other` DataFrame using `func` - to element-wise combine columns. The row and column indexes of the - resulting DataFrame will be the union of the two. - - Parameters - ---------- - other : DataFrame - The DataFrame to merge column-wise. - func : function - Function that takes two series as inputs and return a Series or a - scalar. Used to merge the two dataframes column by columns. - fill_value : scalar value, default None - The value to fill NaNs with prior to passing any column to the - merge func. - overwrite : bool, default True - If True, columns in `self` that do not exist in `other` will be - overwritten with NaNs. - - Returns - ------- - DataFrame - Combination of the provided DataFrames. - - See Also - -------- - DataFrame.combine_first : Combine two DataFrame objects and default to - non-null values in frame calling the method. - - Examples - -------- - Combine using a simple function that chooses the smaller column. - - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) - >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 - >>> df1.combine(df2, take_smaller) - A B - 0 0 3 - 1 0 3 - - Example using a true element-wise combine function. - - >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) - >>> df1.combine(df2, np.minimum) - A B - 0 1 2 - 1 0 3 - - Using `fill_value` fills Nones prior to passing the column to the - merge function. - - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) - >>> df1.combine(df2, take_smaller, fill_value=-5) - A B - 0 0 -5.0 - 1 0 4.0 - - However, if the same element in both dataframes is None, that None - is preserved - - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) - >>> df1.combine(df2, take_smaller, fill_value=-5) - A B - 0 0 -5.0 - 1 0 3.0 - - Example that demonstrates the use of `overwrite` and behavior when - the axis differ between the dataframes. - - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) - >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) - >>> df1.combine(df2, take_smaller) - A B C - 0 NaN NaN NaN - 1 NaN 3.0 -10.0 - 2 NaN 3.0 1.0 - - >>> df1.combine(df2, take_smaller, overwrite=False) - A B C - 0 0.0 NaN NaN - 1 0.0 3.0 -10.0 - 2 NaN 3.0 1.0 - - Demonstrating the preference of the passed in dataframe. 
- - >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) - >>> df2.combine(df1, take_smaller) - A B C - 0 0.0 NaN NaN - 1 0.0 3.0 NaN - 2 NaN 3.0 NaN - - >>> df2.combine(df1, take_smaller, overwrite=False) - A B C - 0 0.0 NaN NaN - 1 0.0 3.0 1.0 - 2 NaN 3.0 1.0 - """ - other_idxlen = len(other.index) # save for compare - - this, other = self.align(other, copy=False) - new_index = this.index - - if other.empty and len(new_index) == len(self.index): - return self.copy() - - if self.empty and len(other) == other_idxlen: - return other.copy() - - # sorts if possible - new_columns = this.columns.union(other.columns) - do_fill = fill_value is not None - result = {} - for col in new_columns: - series = this[col] - otherSeries = other[col] - - this_dtype = series.dtype - other_dtype = otherSeries.dtype - - this_mask = isna(series) - other_mask = isna(otherSeries) - - # don't overwrite columns unnecessarily - # DO propagate if this column is not in the intersection - if not overwrite and other_mask.all(): - result[col] = this[col].copy() - continue - - if do_fill: - series = series.copy() - otherSeries = otherSeries.copy() - series[this_mask] = fill_value - otherSeries[other_mask] = fill_value - - if col not in self.columns: - # If self DataFrame does not have col in other DataFrame, - # try to promote series, which is all NaN, as other_dtype. - new_dtype = other_dtype - try: - series = series.astype(new_dtype, copy=False) - except ValueError: - # e.g. new_dtype is integer types - pass - else: - # if we have different dtypes, possibly promote - new_dtype = find_common_type([this_dtype, other_dtype]) - series = series.astype(new_dtype, copy=False) - otherSeries = otherSeries.astype(new_dtype, copy=False) - - arr = func(series, otherSeries) - if isinstance(new_dtype, np.dtype): - # if new_dtype is an EA Dtype, then `func` is expected to return - # the correct dtype without any additional casting - arr = maybe_downcast_to_dtype(arr, new_dtype) - - result[col] = arr - - # convert_objects just in case - return self._constructor(result, index=new_index, columns=new_columns) - - def combine_first(self, other: DataFrame) -> DataFrame: - """ - Update null elements with value in the same location in `other`. - - Combine two DataFrame objects by filling null values in one DataFrame - with non-null values from other DataFrame. The row and column indexes - of the resulting DataFrame will be the union of the two. The resulting - dataframe contains the 'first' dataframe values and overrides the - second one values where both first.loc[index, col] and - second.loc[index, col] are not missing values, upon calling - first.combine_first(second). - - Parameters - ---------- - other : DataFrame - Provided DataFrame to use to fill null values. - - Returns - ------- - DataFrame - The result of combining the provided DataFrame with the other object. - - See Also - -------- - DataFrame.combine : Perform series-wise operation on two DataFrames - using a given function. 
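The ``combiner`` that ``combine_first`` passes into ``combine`` (defined just below) boils down to a masked where: keep the calling frame's value where it is non-missing, otherwise take the other frame's. A per-column sketch in plain NumPy, leaving out the dtype promotion and missing-column handling the real code layers on top:

    import numpy as np
    import pandas as pd


    def combiner_sketch(x: pd.Series, y: pd.Series) -> np.ndarray:
        mask = x.isna().to_numpy()
        return np.where(mask, y.to_numpy(), x.to_numpy())


    x = pd.Series([np.nan, 0.0])
    y = pd.Series([1.0, 3.0])
    print(combiner_sketch(x, y))  # [1. 0.]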
- - Examples - -------- - >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) - >>> df1.combine_first(df2) - A B - 0 1.0 3.0 - 1 0.0 4.0 - - Null values still persist if the location of that null value - does not exist in `other` - - >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) - >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) - >>> df1.combine_first(df2) - A B C - 0 NaN 4.0 NaN - 1 0.0 3.0 1.0 - 2 NaN 3.0 1.0 - """ - import pandas.core.computation.expressions as expressions - - def combiner(x, y): - mask = extract_array(isna(x)) - - x_values = extract_array(x, extract_numpy=True) - y_values = extract_array(y, extract_numpy=True) - - # If the column y in other DataFrame is not in first DataFrame, - # just return y_values. - if y.name not in self.columns: - return y_values - - return expressions.where(mask, y_values, x_values) - - combined = self.combine(other, combiner, overwrite=False) - - dtypes = { - col: find_common_type([self.dtypes[col], other.dtypes[col]]) - for col in self.columns.intersection(other.columns) - if not is_dtype_equal(combined.dtypes[col], self.dtypes[col]) - } - - if dtypes: - combined = combined.astype(dtypes) - - return combined - - @validate_bool_kwargs_from_keywords('overwrite') - def update( - self, - other, - join: str = "left", - overwrite: bool = True, - filter_func=None, - errors: str = "ignore", - ) -> None: - """ - Modify in place using non-NA values from another DataFrame. - - Aligns on indices. There is no return value. - - Parameters - ---------- - other : DataFrame, or object coercible into a DataFrame - Should have at least one matching index/column label - with the original DataFrame. If a Series is passed, - its name attribute must be set, and that will be - used as the column name to align with the original DataFrame. - join : {'left'}, default 'left' - Only left join is implemented, keeping the index and columns of the - original object. - overwrite : bool, default True - How to handle non-NA values for overlapping keys: - - * True: overwrite original DataFrame's values - with values from `other`. - * False: only update values that are NA in - the original DataFrame. - - filter_func : callable(1d-array) -> bool 1d-array, optional - Can choose to replace values other than NA. Return True for values - that should be updated. - errors : {'raise', 'ignore'}, default 'ignore' - If 'raise', will raise a ValueError if the DataFrame and `other` - both contain non-NA data in the same place. - - Returns - ------- - None : method directly changes calling object - - Raises - ------ - ValueError - * When `errors='raise'` and there's overlapping non-NA data. - * When `errors` is not either `'ignore'` or `'raise'` - NotImplementedError - * If `join != 'left'` - - See Also - -------- - dict.update : Similar method for dictionaries. - DataFrame.merge : For column(s)-on-column(s) operations. - - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [400, 500, 600]}) - >>> new_df = pd.DataFrame({'B': [4, 5, 6], - ... 'C': [7, 8, 9]}) - >>> df.update(new_df) - >>> df - A B - 0 1 4 - 1 2 5 - 2 3 6 - - The DataFrame's length does not increase as a result of the update, - only values at matching index/column labels are updated. - - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 
'B': ['x', 'y', 'z']}) - >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) - >>> df.update(new_df) - >>> df - A B - 0 a d - 1 b e - 2 c f - - For Series, its name attribute must be set. - - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) - >>> df.update(new_column) - >>> df - A B - 0 a d - 1 b y - 2 c e - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) - >>> df.update(new_df) - >>> df - A B - 0 a x - 1 b d - 2 c e - - If `other` contains NaNs the corresponding values are not updated - in the original dataframe. - - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [400, 500, 600]}) - >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) - >>> df.update(new_df) - >>> df - A B - 0 1 4.0 - 1 2 500.0 - 2 3 6.0 - """ - import pandas.core.computation.expressions as expressions - - # TODO: Support other joins - if join != "left": # pragma: no cover - raise NotImplementedError("Only left join is supported") - if errors not in ["ignore", "raise"]: - raise ValueError("The parameter errors must be either 'ignore' or 'raise'") - - if not isinstance(other, DataFrame): - other = DataFrame(other) - - other = other.reindex_like(self) - - for col in self.columns: - this = self[col]._values - that = other[col]._values - if filter_func is not None: - with np.errstate(all="ignore"): - mask = ~filter_func(this) | isna(that) - else: - if errors == "raise": - mask_this = notna(that) - mask_that = notna(this) - if any(mask_this & mask_that): - raise ValueError("Data overlaps.") - - if overwrite: - mask = isna(that) - else: - mask = notna(this) - - # don't overwrite columns unnecessarily - if mask.all(): - continue - - self[col] = expressions.where(mask, this, that) - - # ---------------------------------------------------------------------- - # Data reshaping - @Appender( - """ -Examples --------- ->>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', -... 'Parrot', 'Parrot'], -... 'Max Speed': [380., 370., 24., 26.]}) ->>> df - Animal Max Speed -0 Falcon 380.0 -1 Falcon 370.0 -2 Parrot 24.0 -3 Parrot 26.0 ->>> df.groupby(['Animal']).mean() - Max Speed -Animal -Falcon 375.0 -Parrot 25.0 - -**Hierarchical Indexes** - -We can groupby different levels of a hierarchical index -using the `level` parameter: - ->>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], -... ['Captive', 'Wild', 'Captive', 'Wild']] ->>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) ->>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, -... index=index) ->>> df - Max Speed -Animal Type -Falcon Captive 390.0 - Wild 350.0 -Parrot Captive 30.0 - Wild 20.0 ->>> df.groupby(level=0).mean() - Max Speed -Animal -Falcon 370.0 -Parrot 25.0 ->>> df.groupby(level="Type").mean() - Max Speed -Type -Captive 210.0 -Wild 185.0 - -We can also choose to include NA in group keys or not by setting -`dropna` parameter, the default setting is `True`. 
- ->>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] ->>> df = pd.DataFrame(l, columns=["a", "b", "c"]) - ->>> df.groupby(by=["b"]).sum() - a c -b -1.0 2 3 -2.0 2 5 - ->>> df.groupby(by=["b"], dropna=False).sum() - a c -b -1.0 2 3 -2.0 2 5 -NaN 1 4 - ->>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] ->>> df = pd.DataFrame(l, columns=["a", "b", "c"]) - ->>> df.groupby(by="a").sum() - b c -a -a 13.0 13.0 -b 12.3 123.0 - ->>> df.groupby(by="a", dropna=False).sum() - b c -a -a 13.0 13.0 -b 12.3 123.0 -NaN 12.3 33.0 - -When using ``.apply()``, use ``group_keys`` to include or exclude the group keys. -The ``group_keys`` argument defaults to ``True`` (include). - ->>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', -... 'Parrot', 'Parrot'], -... 'Max Speed': [380., 370., 24., 26.]}) ->>> df.groupby("Animal", group_keys=True).apply(lambda x: x) - Animal Max Speed -Animal -Falcon 0 Falcon 380.0 - 1 Falcon 370.0 -Parrot 2 Parrot 24.0 - 3 Parrot 26.0 - ->>> df.groupby("Animal", group_keys=False).apply(lambda x: x) - Animal Max Speed -0 Falcon 380.0 -1 Falcon 370.0 -2 Parrot 24.0 -3 Parrot 26.0 -""" - ) - @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) - @validate_bool_kwargs_from_keywords('as_index', 'sort', 'observed', 'dropna') - def groupby( - self, - by=None, - axis: Axis = 0, - level: Level | None = None, - as_index: bool = True, - sort: bool = True, - group_keys: bool | lib.NoDefault = no_default, - squeeze: bool | lib.NoDefault = no_default, - observed: bool = False, - dropna: bool = True, - ) -> DataFrameGroupBy: - from pandas.core.groupby.generic import DataFrameGroupBy - - if squeeze is not no_default: - warnings.warn( - ( - "The `squeeze` parameter is deprecated and " - "will be removed in a future version." - ), - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - squeeze = False - - if level is None and by is None: - raise TypeError("You have to supply one of 'by' and 'level'") - axis = self._get_axis_number(axis) - - # https://github.com/python/mypy/issues/7642 - # error: Argument "squeeze" to "DataFrameGroupBy" has incompatible type - # "Union[bool, NoDefault]"; expected "bool" - return DataFrameGroupBy( - obj=self, - keys=by, - axis=axis, - level=level, - as_index=as_index, - sort=sort, - group_keys=group_keys, - squeeze=squeeze, # type: ignore[arg-type] - observed=observed, - dropna=dropna, - ) - - _shared_docs[ - "pivot" - ] = """ - Return reshaped DataFrame organized by given index / column values. - - Reshape data (produce a "pivot" table) based on column values. Uses - unique values from specified `index` / `columns` to form axes of the - resulting DataFrame. This function does not support data - aggregation, multiple values will result in a MultiIndex in the - columns. See the :ref:`User Guide ` for more on reshaping. - - Parameters - ----------%s - index : str or object or a list of str, optional - Column to use to make new frame's index. If None, uses - existing index. - - .. versionchanged:: 1.1.0 - Also accept list of index names. - - columns : str or object or a list of str - Column to use to make new frame's columns. - - .. versionchanged:: 1.1.0 - Also accept list of columns names. - - values : str, object or a list of the previous, optional - Column(s) to use for populating new frame's values. If not - specified, all remaining columns will be used and the result will - have hierarchically indexed columns. - - Returns - ------- - DataFrame - Returns reshaped DataFrame. 
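The ``squeeze`` handling in ``groupby`` above is the standard pandas deprecation recipe: warn with a ``FutureWarning`` at the caller's stack level only when the argument is passed explicitly, and silently fall back to the old default otherwise. Observable behavior on the 1.x line this patch targets:

    import warnings

    import pandas as pd

    df = pd.DataFrame({"Animal": ["Falcon", "Parrot"],
                       "Max Speed": [380.0, 24.0]})

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df.groupby("Animal", squeeze=False).mean()  # explicit: warns
        df.groupby("Animal").mean()                 # omitted: silent

    print([type(w.message).__name__ for w in caught])  # ['FutureWarning']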
- - Raises - ------ - ValueError - When there are any `index`, `columns` combinations with multiple - values. Use `DataFrame.pivot_table` when you need to aggregate. - - See Also - -------- - DataFrame.pivot_table : Generalization of pivot that can handle - duplicate values for one index/column pair. - DataFrame.unstack : Pivot based on the index values instead of a - column. - wide_to_long : Wide panel to long format. Less flexible but more - user-friendly than melt. - - Notes - ----- - For finer-tuned control, see hierarchical indexing documentation along - with the related stack/unstack methods. - - Reference :ref:`the user guide ` for more examples. - - Examples - -------- - >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', - ... 'two'], - ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], - ... 'baz': [1, 2, 3, 4, 5, 6], - ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) - >>> df - foo bar baz zoo - 0 one A 1 x - 1 one B 2 y - 2 one C 3 z - 3 two A 4 q - 4 two B 5 w - 5 two C 6 t - - >>> df.pivot(index='foo', columns='bar', values='baz') - bar A B C - foo - one 1 2 3 - two 4 5 6 - - >>> df.pivot(index='foo', columns='bar')['baz'] - bar A B C - foo - one 1 2 3 - two 4 5 6 - - >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) - baz zoo - bar A B C A B C - foo - one 1 2 3 x y z - two 4 5 6 q w t - - You could also assign a list of column names or a list of index names. - - >>> df = pd.DataFrame({ - ... "lev1": [1, 1, 1, 2, 2, 2], - ... "lev2": [1, 1, 2, 1, 1, 2], - ... "lev3": [1, 2, 1, 2, 1, 2], - ... "lev4": [1, 2, 3, 4, 5, 6], - ... "values": [0, 1, 2, 3, 4, 5]}) - >>> df - lev1 lev2 lev3 lev4 values - 0 1 1 1 1 0 - 1 1 1 2 2 1 - 2 1 2 1 3 2 - 3 2 1 2 4 3 - 4 2 1 1 5 4 - 5 2 2 2 6 5 - - >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") - lev2 1 2 - lev3 1 2 1 2 - lev1 - 1 0.0 1.0 2.0 NaN - 2 4.0 3.0 NaN 5.0 - - >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") - lev3 1 2 - lev1 lev2 - 1 1 0.0 1.0 - 2 2.0 NaN - 2 1 4.0 3.0 - 2 NaN 5.0 - - A ValueError is raised if there are any duplicates. - - >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], - ... "bar": ['A', 'A', 'B', 'C'], - ... "baz": [1, 2, 3, 4]}) - >>> df - foo bar baz - 0 one A 1 - 1 one A 2 - 2 two B 3 - 3 two C 4 - - Notice that the first two rows are the same for our `index` - and `columns` arguments. - - >>> df.pivot(index='foo', columns='bar', values='baz') - Traceback (most recent call last): - ... - ValueError: Index contains duplicate entries, cannot reshape - """ - - @Substitution("") - @Appender(_shared_docs["pivot"]) - def pivot(self, index=None, columns=None, values=None) -> DataFrame: - from pandas.core.reshape.pivot import pivot - - return pivot(self, index=index, columns=columns, values=values) - - _shared_docs[ - "pivot_table" - ] = """ - Create a spreadsheet-style pivot table as a DataFrame. - - The levels in the pivot table will be stored in MultiIndex objects - (hierarchical indexes) on the index and columns of the result DataFrame. - - Parameters - ----------%s - values : column to aggregate, optional - index : column, Grouper, array, or list of the previous - If an array is passed, it must be the same length as the data. The - list can contain any of the other types (except list). - Keys to group by on the pivot table index. If an array is passed, - it is used in the same manner as column values. - columns : column, Grouper, array, or list of the previous - If an array is passed, it must be the same length as the data.
The - list can contain any of the other types (except list). - Keys to group by on the pivot table column. If an array is passed, - it is used in the same manner as column values. - aggfunc : function, list of functions, dict, default numpy.mean - If a list of functions is passed, the resulting pivot table will have - hierarchical columns whose top level are the function names - (inferred from the function objects themselves). - If a dict is passed, the key is the column to aggregate and the value - is the function or list of functions. - fill_value : scalar, default None - Value to replace missing values with (in the resulting pivot table, - after aggregation). - margins : bool, default False - Add all row / column margins (e.g. subtotals / grand totals). - dropna : bool, default True - Do not include columns whose entries are all NaN. - margins_name : str, default 'All' - Name of the row / column that will contain the totals - when margins is True. - observed : bool, default False - This only applies if any of the groupers are Categoricals. - If True: only show observed values for categorical groupers. - If False: show all values for categorical groupers. - - .. versionchanged:: 0.25.0 - - sort : bool, default True - Specifies if the result should be sorted. - - .. versionadded:: 1.3.0 - - Returns - ------- - DataFrame - An Excel style pivot table. - - See Also - -------- - DataFrame.pivot : Pivot without aggregation that can handle - non-numeric data. - DataFrame.melt : Unpivot a DataFrame from wide to long format, - optionally leaving identifiers set. - wide_to_long : Wide panel to long format. Less flexible but more - user-friendly than melt. - - Notes - ----- - Reference :ref:`the user guide ` for more examples. - - Examples - -------- - >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", - ... "bar", "bar", "bar", "bar"], - ... "B": ["one", "one", "one", "two", "two", - ... "one", "one", "two", "two"], - ... "C": ["small", "large", "large", "small", - ... "small", "large", "small", "small", - ... "large"], - ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) - >>> df - A B C D E - 0 foo one small 1 2 - 1 foo one large 2 4 - 2 foo one large 2 5 - 3 foo two small 3 5 - 4 foo two small 3 6 - 5 bar one large 4 6 - 6 bar one small 5 8 - 7 bar two small 6 9 - 8 bar two large 7 9 - - This first example aggregates values by taking the sum. - - >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], - ... columns=['C'], aggfunc=np.sum) - >>> table - C large small - A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 - - We can also fill missing values using the `fill_value` parameter. - - >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], - ... columns=['C'], aggfunc=np.sum, fill_value=0) - >>> table - C large small - A B - bar one 4 5 - two 7 6 - foo one 4 1 - two 0 6 - - The next example aggregates by taking the mean across multiple columns. - - >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], - ... aggfunc={'D': np.mean, - ... 'E': np.mean}) - >>> table - D E - A C - bar large 5.500000 7.500000 - small 5.500000 8.500000 - foo large 2.000000 4.500000 - small 2.333333 4.333333 - - We can also calculate multiple types of aggregations for any given - value column. - - >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], - ... aggfunc={'D': np.mean, - ...
'E': [min, max, np.mean]}) - >>> table - D E - mean max mean min - A C - bar large 5.500000 9 7.500000 6 - small 5.500000 9 8.500000 8 - foo large 2.000000 5 4.500000 4 - small 2.333333 6 4.333333 2 - """ - - @Substitution("") - @Appender(_shared_docs["pivot_table"]) - def pivot_table( - self, - values=None, - index=None, - columns=None, - aggfunc="mean", - fill_value=None, - margins=False, - dropna=True, - margins_name="All", - observed=False, - sort=True, - ) -> DataFrame: - from pandas.core.reshape.pivot import pivot_table - - return pivot_table( - self, - values=values, - index=index, - columns=columns, - aggfunc=aggfunc, - fill_value=fill_value, - margins=margins, - dropna=dropna, - margins_name=margins_name, - observed=observed, - sort=sort, - ) - - @validate_bool_kwargs_from_keywords('dropna') - def stack(self, level: Level = -1, dropna: bool = True): - """ - Stack the prescribed level(s) from columns to index. - - Return a reshaped DataFrame or Series having a multi-level - index with one or more new inner-most levels compared to the current - DataFrame. The new inner-most levels are created by pivoting the - columns of the current dataframe: - - - if the columns have a single level, the output is a Series; - - if the columns have multiple levels, the new index - level(s) is (are) taken from the prescribed level(s) and - the output is a DataFrame. - - Parameters - ---------- - level : int, str, list, default -1 - Level(s) to stack from the column axis onto the index - axis, defined as one index or label, or a list of indices - or labels. - dropna : bool, default True - Whether to drop rows in the resulting Frame/Series with - missing values. Stacking a column level onto the index - axis can create combinations of index and column values - that are missing from the original dataframe. See Examples - section. - - Returns - ------- - DataFrame or Series - Stacked dataframe or series. - - See Also - -------- - DataFrame.unstack : Unstack prescribed level(s) from index axis - onto column axis. - DataFrame.pivot : Reshape dataframe from long format to wide - format. - DataFrame.pivot_table : Create a spreadsheet-style pivot table - as a DataFrame. - - Notes - ----- - The function is named by analogy with a collection of books - being reorganized from being side by side on a horizontal - position (the columns of the dataframe) to being stacked - vertically on top of each other (in the index of the - dataframe). - - Reference :ref:`the user guide ` for more examples. - - Examples - -------- - **Single level columns** - - >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], - ... index=['cat', 'dog'], - ... columns=['weight', 'height']) - - Stacking a dataframe with a single level column axis returns a Series: - - >>> df_single_level_cols - weight height - cat 0 1 - dog 2 3 - >>> df_single_level_cols.stack() - cat weight 0 - height 1 - dog weight 2 - height 3 - dtype: int64 - - **Multi level columns: simple case** - - >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), - ... ('weight', 'pounds')]) - >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], - ... index=['cat', 'dog'], - ... columns=multicol1) - - Stacking a dataframe with a multi-level column axis: - - >>> df_multi_level_cols1 - weight - kg pounds - cat 1 2 - dog 2 4 - >>> df_multi_level_cols1.stack() - weight - cat kg 1 - pounds 2 - dog kg 2 - pounds 4 - - **Missing values** - - >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), - ... 
('height', 'm')]) - >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], - ... index=['cat', 'dog'], - ... columns=multicol2) - - It is common to have missing values when stacking a dataframe - with multi-level columns, as the stacked dataframe typically - has more values than the original dataframe. Missing values - are filled with NaNs: - - >>> df_multi_level_cols2 - weight height - kg m - cat 1.0 2.0 - dog 3.0 4.0 - >>> df_multi_level_cols2.stack() - height weight - cat kg NaN 1.0 - m 2.0 NaN - dog kg NaN 3.0 - m 4.0 NaN - - **Prescribing the level(s) to be stacked** - - The first parameter controls which level or levels are stacked: - - >>> df_multi_level_cols2.stack(0) - kg m - cat height NaN 2.0 - weight 1.0 NaN - dog height NaN 4.0 - weight 3.0 NaN - >>> df_multi_level_cols2.stack([0, 1]) - cat height m 2.0 - weight kg 1.0 - dog height m 4.0 - weight kg 3.0 - dtype: float64 - - **Dropping missing values** - - >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]], - ... index=['cat', 'dog'], - ... columns=multicol2) - - Note that rows where all values are missing are dropped by - default but this behaviour can be controlled via the dropna - keyword parameter: - - >>> df_multi_level_cols3 - weight height - kg m - cat NaN 1.0 - dog 2.0 3.0 - >>> df_multi_level_cols3.stack(dropna=False) - height weight - cat kg NaN NaN - m 1.0 NaN - dog kg NaN 2.0 - m 3.0 NaN - >>> df_multi_level_cols3.stack(dropna=True) - height weight - cat m 1.0 NaN - dog kg NaN 2.0 - m 3.0 NaN - """ - from pandas.core.reshape.reshape import ( - stack, - stack_multiple, - ) - - if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna) - else: - result = stack(self, level, dropna=dropna) - - return result.__finalize__(self, method="stack") - - @validate_bool_kwargs_from_keywords('ignore_index') - def explode( - self, - column: IndexLabel, - ignore_index: bool = False, - ) -> DataFrame: - """ - Transform each element of a list-like to a row, replicating index values. - - .. versionadded:: 0.25.0 - - Parameters - ---------- - column : IndexLabel - Column(s) to explode. - For multiple columns, specify a non-empty list in which each - element is a str or tuple; the list-like data in all specified - columns must have matching lengths within each row of the frame. - - .. versionadded:: 1.3.0 - Multi-column explode - - ignore_index : bool, default False - If True, the resulting index will be labeled 0, 1, …, n - 1. - - .. versionadded:: 1.1.0 - - Returns - ------- - DataFrame - Exploded lists to rows of the subset columns; - index will be duplicated for these rows. - - Raises - ------ - ValueError : - * If columns of the frame are not unique. - * If the specified columns to explode are an empty list. - * If the specified columns to explode do not have matching counts of - elements row-wise in the frame. - - See Also - -------- - DataFrame.unstack : Pivot a level of the (necessarily hierarchical) - index labels. - DataFrame.melt : Unpivot a DataFrame from wide format to long format. - Series.explode : Explode a Series from list-like entries to long format. - - Notes - ----- - This routine will explode list-likes including lists, tuples, sets, - Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged, and empty list-likes will - result in a np.nan for that row. In addition, the ordering of rows in the - output will be non-deterministic when exploding sets. - - Reference :ref:`the user guide ` for more examples.
- - Examples - -------- - >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], - ... 'B': 1, - ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) - >>> df - A B C - 0 [0, 1, 2] 1 [a, b, c] - 1 foo 1 NaN - 2 [] 1 [] - 3 [3, 4] 1 [d, e] - - Single-column explode. - - >>> df.explode('A') - A B C - 0 0 1 [a, b, c] - 0 1 1 [a, b, c] - 0 2 1 [a, b, c] - 1 foo 1 NaN - 2 NaN 1 [] - 3 3 1 [d, e] - 3 4 1 [d, e] - - Multi-column explode. - - >>> df.explode(list('AC')) - A B C - 0 0 1 a - 0 1 1 b - 0 2 1 c - 1 foo 1 NaN - 2 NaN 1 NaN - 3 3 1 d - 3 4 1 e - """ - if not self.columns.is_unique: - raise ValueError("columns must be unique") - - columns: list[Hashable] - if is_scalar(column) or isinstance(column, tuple): - columns = [column] - elif isinstance(column, list) and all( - map(lambda c: is_scalar(c) or isinstance(c, tuple), column) - ): - if not column: - raise ValueError("column must be nonempty") - if len(column) > len(set(column)): - raise ValueError("column must be unique") - columns = column - else: - raise ValueError("column must be a scalar, tuple, or list thereof") - - df = self.reset_index(drop=True) - if len(columns) == 1: - result = df[columns[0]].explode() - else: - mylen = lambda x: len(x) if is_list_like(x) else -1 - counts0 = self[columns[0]].apply(mylen) - for c in columns[1:]: - if not all(counts0 == self[c].apply(mylen)): - raise ValueError("columns must have matching element counts") - result = DataFrame({c: df[c].explode() for c in columns}) - result = df.drop(columns, axis=1).join(result) - if ignore_index: - result.index = default_index(len(result)) - else: - result.index = self.index.take(result.index) - result = result.reindex(columns=self.columns, copy=False) - - return result.__finalize__(self, method="explode") - - def unstack(self, level: Level = -1, fill_value=None): - """ - Pivot a level of the (necessarily hierarchical) index labels. - - Returns a DataFrame having a new level of column labels whose inner-most level - consists of the pivoted index labels. - - If the index is not a MultiIndex, the output will be a Series - (the analogue of stack when the columns are not a MultiIndex). - - Parameters - ---------- - level : int, str, or list of these, default -1 (last level) - Level(s) of index to unstack, can pass level name. - fill_value : int, str or dict - Replace NaN with this value if the unstack produces missing values. - - Returns - ------- - Series or DataFrame - - See Also - -------- - DataFrame.pivot : Pivot a table based on column values. - DataFrame.stack : Pivot a level of the column labels (inverse operation - from `unstack`). - - Notes - ----- - Reference :ref:`the user guide ` for more examples. - - Examples - -------- - >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), - ... 
('two', 'a'), ('two', 'b')]) - >>> s = pd.Series(np.arange(1.0, 5.0), index=index) - >>> s - one a 1.0 - b 2.0 - two a 3.0 - b 4.0 - dtype: float64 - - >>> s.unstack(level=-1) - a b - one 1.0 2.0 - two 3.0 4.0 - - >>> s.unstack(level=0) - one two - a 1.0 3.0 - b 2.0 4.0 - - >>> df = s.unstack(level=0) - >>> df.unstack() - one a 1.0 - b 2.0 - two a 3.0 - b 4.0 - dtype: float64 - """ - from pandas.core.reshape.reshape import unstack - - result = unstack(self, level, fill_value) - - return result.__finalize__(self, method="unstack") - - @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) - def melt( - self, - id_vars=None, - value_vars=None, - var_name=None, - value_name="value", - col_level: Level | None = None, - ignore_index: bool = True, - ) -> DataFrame: - - return melt( - self, - id_vars=id_vars, - value_vars=value_vars, - var_name=var_name, - value_name=value_name, - col_level=col_level, - ignore_index=ignore_index, - ).__finalize__(self, method="melt") - - # ---------------------------------------------------------------------- - # Time series-related - - @doc( - Series.diff, - klass="DataFrame", - extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n " - "Take difference over rows (0) or columns (1).\n", - other_klass="Series", - examples=dedent( - """ - Difference with previous row - - >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], - ... 'b': [1, 1, 2, 3, 5, 8], - ... 'c': [1, 4, 9, 16, 25, 36]}) - >>> df - a b c - 0 1 1 1 - 1 2 1 4 - 2 3 2 9 - 3 4 3 16 - 4 5 5 25 - 5 6 8 36 - - >>> df.diff() - a b c - 0 NaN NaN NaN - 1 1.0 0.0 3.0 - 2 1.0 1.0 5.0 - 3 1.0 1.0 7.0 - 4 1.0 2.0 9.0 - 5 1.0 3.0 11.0 - - Difference with previous column - - >>> df.diff(axis=1) - a b c - 0 NaN 0 0 - 1 NaN -1 3 - 2 NaN -1 7 - 3 NaN -1 13 - 4 NaN 0 20 - 5 NaN 2 28 - - Difference with 3rd previous row - - >>> df.diff(periods=3) - a b c - 0 NaN NaN NaN - 1 NaN NaN NaN - 2 NaN NaN NaN - 3 3.0 2.0 15.0 - 4 3.0 4.0 21.0 - 5 3.0 6.0 27.0 - - Difference with following row - - >>> df.diff(periods=-1) - a b c - 0 -1.0 0.0 -3.0 - 1 -1.0 -1.0 -5.0 - 2 -1.0 -1.0 -7.0 - 3 -1.0 -2.0 -9.0 - 4 -1.0 -3.0 -11.0 - 5 NaN NaN NaN - - Overflow in input dtype - - >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8) - >>> df.diff() - a - 0 NaN - 1 255.0""" - ), - ) - def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: - if not lib.is_integer(periods): - if not ( - is_float(periods) - # error: "int" has no attribute "is_integer" - and periods.is_integer() # type: ignore[attr-defined] - ): - raise ValueError("periods must be an integer") - periods = int(periods) - - axis = self._get_axis_number(axis) - if axis == 1 and periods != 0: - return self - self.shift(periods, axis=axis) - - new_data = self._mgr.diff(n=periods, axis=axis) - return self._constructor(new_data).__finalize__(self, "diff") - - # ---------------------------------------------------------------------- - # Function application - - def _gotitem( - self, - key: IndexLabel, - ndim: int, - subset: DataFrame | Series | None = None, - ) -> DataFrame | Series: - """ - Sub-classes to define. Return a sliced object. - - Parameters - ---------- - key : string / list of selections - ndim : {1, 2} - requested ndim of result - subset : object, default None - subset to act on - """ - if subset is None: - subset = self - elif subset.ndim == 1: # is Series - return subset - - # TODO: _shallow_copy(subset)? 
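# Editor's note (illustrative comment, not part of the original patch):
# _gotitem is the hook the agg/apply machinery uses to slice this object
# down to the requested key before applying a function, e.g.
# df.agg({"a": "sum"}) selects df["a"] here; a 1-dim subset has already
# been returned unchanged above.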
- return subset[key] - - _agg_summary_and_see_also_doc = dedent( - """ - The aggregation operations are always performed over an axis, either the - index (default) or the column axis. This behavior is different from - `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, - `var`), where the default is to compute the aggregation of the flattened - array, e.g., ``numpy.mean(arr_2d)`` as opposed to - ``numpy.mean(arr_2d, axis=0)``. - - `agg` is an alias for `aggregate`. Use the alias. - - See Also - -------- - DataFrame.apply : Perform any type of operations. - DataFrame.transform : Perform transformation type operations. - core.groupby.GroupBy : Perform operations over groups. - core.resample.Resampler : Perform operations over resampled bins. - core.window.Rolling : Perform operations over rolling window. - core.window.Expanding : Perform operations over expanding window. - core.window.ExponentialMovingWindow : Perform operation over exponential weighted - window. - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> df = pd.DataFrame([[1, 2, 3], - ... [4, 5, 6], - ... [7, 8, 9], - ... [np.nan, np.nan, np.nan]], - ... columns=['A', 'B', 'C']) - - Aggregate these functions over the rows. - - >>> df.agg(['sum', 'min']) - A B C - sum 12.0 15.0 18.0 - min 1.0 2.0 3.0 - - Different aggregations per column. - - >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) - A B - sum 12.0 NaN - min 1.0 2.0 - max NaN 8.0 - - Aggregate different functions over the columns and rename the index of the resulting - DataFrame. - - >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) - A B C - x 7.0 NaN NaN - y NaN 2.0 NaN - z NaN NaN 6.0 - - Aggregate over the columns. - - >>> df.agg("mean", axis="columns") - 0 2.0 - 1 5.0 - 2 8.0 - 3 NaN - dtype: float64 - """ - ) - - @doc( - _shared_docs["aggregate"], - klass=_shared_doc_kwargs["klass"], - axis=_shared_doc_kwargs["axis"], - see_also=_agg_summary_and_see_also_doc, - examples=_agg_examples_doc, - ) - def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): - from pandas.core.apply import frame_apply - - axis = self._get_axis_number(axis) - - relabeling, func, columns, order = reconstruct_func(func, **kwargs) - - op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) - result = op.agg() - - if relabeling: - # This is to keep the order to columns occurrence unchanged, and also - # keep the order of new columns occurrence unchanged - - # For the return values of reconstruct_func, if relabeling is - # False, columns and order will be None. - assert columns is not None - assert order is not None - - result_in_dict = relabel_result(result, func, columns, order) - result = DataFrame(result_in_dict, index=columns) - - return result - - agg = aggregate - - @doc( - _shared_docs["transform"], - klass=_shared_doc_kwargs["klass"], - axis=_shared_doc_kwargs["axis"], - ) - def transform( - self, func: AggFuncType, axis: Axis = 0, *args, **kwargs - ) -> DataFrame: - from pandas.core.apply import frame_apply - - op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) - result = op.transform() - assert isinstance(result, DataFrame) - return result - - @validate_bool_kwargs_from_keywords('raw') - def apply( - self, - func: AggFuncType, - axis: Axis = 0, - raw: bool = False, - result_type=None, - args=(), - **kwargs, - ): - """ - Apply a function along an axis of the DataFrame. 
- - Objects passed to the function are Series objects whose index is - either the DataFrame's index (``axis=0``) or the DataFrame's columns - (``axis=1``). By default (``result_type=None``), the final return type - is inferred from the return type of the applied function. Otherwise, - it depends on the `result_type` argument. - - Parameters - ---------- - func : function - Function to apply to each column or row. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Axis along which the function is applied: - - * 0 or 'index': apply function to each column. - * 1 or 'columns': apply function to each row. - - raw : bool, default False - Determines if row or column is passed as a Series or ndarray object: - - * ``False`` : passes each row or column as a Series to the - function. - * ``True`` : the passed function will receive ndarray objects - instead. - If you are just applying a NumPy reduction function this will - achieve much better performance. - - result_type : {'expand', 'reduce', 'broadcast', None}, default None - These only act when ``axis=1`` (columns): - - * 'expand' : list-like results will be turned into columns. - * 'reduce' : returns a Series if possible rather than expanding - list-like results. This is the opposite of 'expand'. - * 'broadcast' : results will be broadcast to the original shape - of the DataFrame, the original index and columns will be - retained. - - The default behaviour (None) depends on the return value of the - applied function: list-like results will be returned as a Series - of those. However if the apply function returns a Series these - are expanded to columns. - args : tuple - Positional arguments to pass to `func` in addition to the - array/series. - **kwargs - Additional keyword arguments to pass as keywords arguments to - `func`. - - Returns - ------- - Series or DataFrame - Result of applying ``func`` along the given axis of the - DataFrame. - - See Also - -------- - DataFrame.applymap: For elementwise operations. - DataFrame.aggregate: Only perform aggregating type operations. - DataFrame.transform: Only perform transforming type operations. - - Notes - ----- - Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` - for more details. - - Examples - -------- - >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) - >>> df - A B - 0 4 9 - 1 4 9 - 2 4 9 - - Using a numpy universal function (in this case the same as - ``np.sqrt(df)``): - - >>> df.apply(np.sqrt) - A B - 0 2.0 3.0 - 1 2.0 3.0 - 2 2.0 3.0 - - Using a reducing function on either axis - - >>> df.apply(np.sum, axis=0) - A 12 - B 27 - dtype: int64 - - >>> df.apply(np.sum, axis=1) - 0 13 - 1 13 - 2 13 - dtype: int64 - - Returning a list-like will result in a Series - - >>> df.apply(lambda x: [1, 2], axis=1) - 0 [1, 2] - 1 [1, 2] - 2 [1, 2] - dtype: object - - Passing ``result_type='expand'`` will expand list-like results - to columns of a Dataframe - - >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') - 0 1 - 0 1 2 - 1 1 2 - 2 1 2 - - Returning a Series inside the function is similar to passing - ``result_type='expand'``. The resulting column names - will be the Series index. - - >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) - foo bar - 0 1 2 - 1 1 2 - 2 1 2 - - Passing ``result_type='broadcast'`` will ensure the same shape - result, whether list-like or scalar is returned by the function, - and broadcast it along the axis. 
The resulting column names will - be the originals. - - >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') - A B - 0 1 2 - 1 1 2 - 2 1 2 - """ - from pandas.core.apply import frame_apply - - op = frame_apply( - self, - func=func, - axis=axis, - raw=raw, - result_type=result_type, - args=args, - kwargs=kwargs, - ) - return op.apply().__finalize__(self, method="apply") - - def applymap( - self, func: PythonFuncType, na_action: str | None = None, **kwargs - ) -> DataFrame: - """ - Apply a function to a Dataframe elementwise. - - This method applies a function that accepts and returns a scalar - to every element of a DataFrame. - - Parameters - ---------- - func : callable - Python function, returns a single value from a single value. - na_action : {None, 'ignore'}, default None - If ‘ignore’, propagate NaN values, without passing them to func. - - .. versionadded:: 1.2 - - **kwargs - Additional keyword arguments to pass as keywords arguments to - `func`. - - .. versionadded:: 1.3.0 - - Returns - ------- - DataFrame - Transformed DataFrame. - - See Also - -------- - DataFrame.apply : Apply a function along input axis of DataFrame. - - Examples - -------- - >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) - >>> df - 0 1 - 0 1.000 2.120 - 1 3.356 4.567 - - >>> df.applymap(lambda x: len(str(x))) - 0 1 - 0 3 4 - 1 5 5 - - Like Series.map, NA values can be ignored: - - >>> df_copy = df.copy() - >>> df_copy.iloc[0, 0] = pd.NA - >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore') - 0 1 - 0 NaN 4 - 1 5.0 5 - - Note that a vectorized version of `func` often exists, which will - be much faster. You could square each number elementwise. - - >>> df.applymap(lambda x: x**2) - 0 1 - 0 1.000000 4.494400 - 1 11.262736 20.857489 - - But it's better to avoid applymap in that case. - - >>> df ** 2 - 0 1 - 0 1.000000 4.494400 - 1 11.262736 20.857489 - """ - if na_action not in {"ignore", None}: - raise ValueError( - f"na_action must be 'ignore' or None. Got {repr(na_action)}" - ) - ignore_na = na_action == "ignore" - func = functools.partial(func, **kwargs) - - # if we have a dtype == 'M8[ns]', provide boxed values - def infer(x): - if x.empty: - return lib.map_infer(x, func, ignore_na=ignore_na) - return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) - - return self.apply(infer).__finalize__(self, "applymap") - - # ---------------------------------------------------------------------- - # Merging / joining methods - - @validate_bool_kwargs_from_keywords('ignore_index', 'verify_integrity', 'sort') - def append( - self, - other, - ignore_index: bool = False, - verify_integrity: bool = False, - sort: bool = False, - ) -> DataFrame: - """ - Append rows of `other` to the end of caller, returning a new object. - - .. deprecated:: 1.4.0 - Use :func:`concat` instead. For further details see - :ref:`whatsnew_140.deprecations.frame_series_append` - - Columns in `other` that are not in the caller are added as new columns. - - Parameters - ---------- - other : DataFrame or Series/dict-like object, or list of these - The data to append. - ignore_index : bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. - verify_integrity : bool, default False - If True, raise ValueError on creating index with duplicates. - sort : bool, default False - Sort columns if the columns of `self` and `other` are not aligned. - - .. versionchanged:: 1.0.0 - - Changed to not sort by default. 
- - Returns - ------- - DataFrame - A new DataFrame consisting of the rows of caller and the rows of `other`. - - See Also - -------- - concat : General function to concatenate DataFrame or Series objects. - - Notes - ----- - If a list of dict/series is passed and the keys are all contained in - the DataFrame's index, the order of the columns in the resulting - DataFrame will be unchanged. - - Iteratively appending rows to a DataFrame can be more computationally - intensive than a single concatenate. A better solution is to append - those rows to a list and then concatenate the list with the original - DataFrame all at once. - - Examples - -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'), index=['x', 'y']) - >>> df - A B - x 1 2 - y 3 4 - >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'), index=['x', 'y']) - >>> df.append(df2) - A B - x 1 2 - y 3 4 - x 5 6 - y 7 8 - - With `ignore_index` set to True: - - >>> df.append(df2, ignore_index=True) - A B - 0 1 2 - 1 3 4 - 2 5 6 - 3 7 8 - - The following, while not recommended methods for generating DataFrames, - show two ways to generate a DataFrame from multiple data sources. - - Less efficient: - - >>> df = pd.DataFrame(columns=['A']) - >>> for i in range(5): - ... df = df.append({'A': i}, ignore_index=True) - >>> df - A - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 - - More efficient: - - >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)], - ... ignore_index=True) - A - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 - """ - warnings.warn( - "The frame.append method is deprecated " - "and will be removed from pandas in a future version. " - "Use pandas.concat instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return self._append(other, ignore_index, verify_integrity, sort) - - @validate_bool_kwargs_from_keywords('ignore_index', 'verify_integrity', 'sort') - def _append( - self, - other, - ignore_index: bool = False, - verify_integrity: bool = False, - sort: bool = False, - ) -> DataFrame: - combined_columns = None - if isinstance(other, (Series, dict)): - if isinstance(other, dict): - if not ignore_index: - raise TypeError("Can only append a dict if ignore_index=True") - other = Series(other) - if other.name is None and not ignore_index: - raise TypeError( - "Can only append a Series if ignore_index=True " - "or if the Series has a name" - ) - - index = Index([other.name], name=self.index.name) - idx_diff = other.index.difference(self.columns) - combined_columns = self.columns.append(idx_diff) - row_df = other.to_frame().T - # infer_objects is needed for - # test_append_empty_frame_to_series_with_dateutil_tz - other = row_df.infer_objects().rename_axis(index.names, copy=False) - elif isinstance(other, list): - if not other: - pass - elif not isinstance(other[0], DataFrame): - other = DataFrame(other) - if self.index.name is not None and not ignore_index: - other.index.name = self.index.name - - from pandas.core.reshape.concat import concat - - if isinstance(other, (list, tuple)): - to_concat = [self, *other] - else: - to_concat = [self, other] - - result = concat( - to_concat, - ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort, - ) - if ( - combined_columns is not None - and not sort - and not combined_columns.equals(result.columns) - ): - # TODO: reindexing here is a kludge bc union_indexes does not - # pass sort to index.union, xref #43375 - # combined_columns.equals check is necessary for preserving dtype - # in test_crosstab_normalize - result = result.reindex(combined_columns, 
axis=1) - return result.__finalize__(self, method="append") - - @validate_bool_kwargs_from_keywords('sort') - def join( - self, - other: DataFrame | Series, - on: IndexLabel | None = None, - how: str = "left", - lsuffix: str = "", - rsuffix: str = "", - sort: bool = False, - validate: str | None = None, - ) -> DataFrame: - """ - Join columns of another DataFrame. - - Join columns with `other` DataFrame either on index or on a key - column. Efficiently join multiple DataFrame objects by index at once by - passing a list. - - Parameters - ---------- - other : DataFrame, Series, or list of DataFrame - Index should be similar to one of the columns in this one. If a - Series is passed, its name attribute must be set, and that will be - used as the column name in the resulting joined DataFrame. - on : str, list of str, or array-like, optional - Column or index level name(s) in the caller to join on the index - in `other`, otherwise joins index-on-index. If multiple - values given, the `other` DataFrame must have a MultiIndex. Can - pass an array as the join key if it is not already contained in - the calling DataFrame. Like an Excel VLOOKUP operation. - how : {'left', 'right', 'outer', 'inner'}, default 'left' - How to handle the operation of the two objects. - - * left: use calling frame's index (or column if on is specified). - * right: use `other`'s index. - * outer: form union of calling frame's index (or column if on is - specified) with `other`'s index, and sort it - lexicographically. - * inner: form intersection of calling frame's index (or column if - on is specified) with `other`'s index, preserving the order - of the calling frame's index. - * cross: creates the cartesian product from both frames, preserves the order - of the left keys. - - .. versionadded:: 1.2.0 - - lsuffix : str, default '' - Suffix to use from left frame's overlapping columns. - rsuffix : str, default '' - Suffix to use from right frame's overlapping columns. - sort : bool, default False - Order result DataFrame lexicographically by the join key. If False, - the order of the join key depends on the join type (how keyword). - validate : str, optional - If specified, checks if join is of specified type. - - * "one_to_one" or "1:1": check if join keys are unique in both left - and right datasets. - * "one_to_many" or "1:m": check if join keys are unique in left dataset. - * "many_to_one" or "m:1": check if join keys are unique in right dataset. - * "many_to_many" or "m:m": allowed, but does not result in checks. - - .. versionadded:: 1.5.0 - - Returns - ------- - DataFrame - A dataframe containing columns from both the caller and `other`. - - See Also - -------- - DataFrame.merge : For column(s)-on-column(s) operations. - - Notes - ----- - Parameters `on`, `lsuffix`, and `rsuffix` are not supported when - passing a list of `DataFrame` objects. - - Support for specifying index levels as the `on` parameter was added - in version 0.23.0. - - Examples - -------- - >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], - ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) - - >>> df - key A - 0 K0 A0 - 1 K1 A1 - 2 K2 A2 - 3 K3 A3 - 4 K4 A4 - 5 K5 A5 - - >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], - ... 'B': ['B0', 'B1', 'B2']}) - - >>> other - key B - 0 K0 B0 - 1 K1 B1 - 2 K2 B2 - - Join DataFrames using their indexes.
- - >>> df.join(other, lsuffix='_caller', rsuffix='_other') - key_caller A key_other B - 0 K0 A0 K0 B0 - 1 K1 A1 K1 B1 - 2 K2 A2 K2 B2 - 3 K3 A3 NaN NaN - 4 K4 A4 NaN NaN - 5 K5 A5 NaN NaN - - If we want to join using the key columns, we need to set key to be - the index in both `df` and `other`. The joined DataFrame will have - key as its index. - - >>> df.set_index('key').join(other.set_index('key')) - A B - key - K0 A0 B0 - K1 A1 B1 - K2 A2 B2 - K3 A3 NaN - K4 A4 NaN - K5 A5 NaN - - Another option to join using the key columns is to use the `on` - parameter. DataFrame.join always uses `other`'s index but we can use - any column in `df`. This method preserves the original DataFrame's - index in the result. - - >>> df.join(other.set_index('key'), on='key') - key A B - 0 K0 A0 B0 - 1 K1 A1 B1 - 2 K2 A2 B2 - 3 K3 A3 NaN - 4 K4 A4 NaN - 5 K5 A5 NaN - - Using non-unique key values shows how they are matched. - - >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], - ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) - - >>> df - key A - 0 K0 A0 - 1 K1 A1 - 2 K1 A2 - 3 K3 A3 - 4 K0 A4 - 5 K1 A5 - - >>> df.join(other.set_index('key'), on='key', validate='m:1') - key A B - 0 K0 A0 B0 - 1 K1 A1 B1 - 2 K1 A2 B1 - 3 K3 A3 NaN - 4 K0 A4 B0 - 5 K1 A5 B1 - """ - return self._join_compat( - other, - on=on, - how=how, - lsuffix=lsuffix, - rsuffix=rsuffix, - sort=sort, - validate=validate, - ) - - @validate_bool_kwargs_from_keywords('sort') - def _join_compat( - self, - other: DataFrame | Series, - on: IndexLabel | None = None, - how: str = "left", - lsuffix: str = "", - rsuffix: str = "", - sort: bool = False, - validate: str | None = None, - ): - from pandas.core.reshape.concat import concat - from pandas.core.reshape.merge import merge - - if isinstance(other, Series): - if other.name is None: - raise ValueError("Other Series must have a name") - other = DataFrame({other.name: other}) - - if isinstance(other, DataFrame): - if how == "cross": - return merge( - self, - other, - how=how, - on=on, - suffixes=(lsuffix, rsuffix), - sort=sort, - validate=validate, - ) - return merge( - self, - other, - left_on=on, - how=how, - left_index=on is None, - right_index=True, - suffixes=(lsuffix, rsuffix), - sort=sort, - validate=validate, - ) - else: - if on is not None: - raise ValueError( - "Joining multiple DataFrames only supported for joining on index" - ) - - if rsuffix or lsuffix: - raise ValueError( - "Suffixes not supported when joining multiple DataFrames" - ) - - frames = [self] + list(other) - - can_concat = all(df.index.is_unique for df in frames) - - # join indexes only using concat - if can_concat: - if how == "left": - res = concat( - frames, axis=1, join="outer", verify_integrity=True, sort=sort - ) - return res.reindex(self.index, copy=False) - else: - return concat( - frames, axis=1, join=how, verify_integrity=True, sort=sort - ) - - joined = frames[0] - - for frame in frames[1:]: - joined = merge( - joined, - frame, - how=how, - left_index=True, - right_index=True, - validate=validate, - ) - - return joined - - @Substitution("") - @Appender(_merge_doc, indents=2) - @validate_bool_kwargs_from_keywords('left_index', 'right_index', 'copy', 'indicator', 'sort') - def merge( - self, - right: DataFrame | Series, - how: str = "inner", - on: IndexLabel | None = None, - left_on: IndexLabel | None = None, - right_on: IndexLabel | None = None, - left_index: bool = False, - right_index: bool = False, - sort: bool = False, - suffixes: Suffixes = ("_x", "_y"), - copy: bool = True, - indicator: 
bool = False, - validate: str | None = None, - ) -> DataFrame: - from pandas.core.reshape.merge import merge - - return merge( - self, - right, - how=how, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - sort=sort, - suffixes=suffixes, - copy=copy, - indicator=indicator, - validate=validate, - ) - - def round( - self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs - ) -> DataFrame: - """ - Round a DataFrame to a variable number of decimal places. - - Parameters - ---------- - decimals : int, dict, Series - Number of decimal places to round each column to. If an int is - given, round each column to the same number of places. - Otherwise dict and Series round to variable numbers of places. - Column names should be in the keys if `decimals` is a - dict-like, or in the index if `decimals` is a Series. Any - columns not included in `decimals` will be left as is. Elements - of `decimals` which are not columns of the input will be - ignored. - *args - Additional keywords have no effect but might be accepted for - compatibility with numpy. - **kwargs - Additional keywords have no effect but might be accepted for - compatibility with numpy. - - Returns - ------- - DataFrame - A DataFrame with the affected columns rounded to the specified - number of decimal places. - - See Also - -------- - numpy.around : Round a numpy array to the given number of decimals. - Series.round : Round a Series to the given number of decimals. - - Examples - -------- - >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], - ... columns=['dogs', 'cats']) - >>> df - dogs cats - 0 0.21 0.32 - 1 0.01 0.67 - 2 0.66 0.03 - 3 0.21 0.18 - - By providing an integer each column is rounded to the same number - of decimal places - - >>> df.round(1) - dogs cats - 0 0.2 0.3 - 1 0.0 0.7 - 2 0.7 0.0 - 3 0.2 0.2 - - With a dict, the number of places for specific columns can be - specified with the column names as key and the number of decimal - places as value - - >>> df.round({'dogs': 1, 'cats': 0}) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 - - Using a Series, the number of places for specific columns can be - specified with the column names as index and the number of - decimal places as value - - >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) - >>> df.round(decimals) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 - """ - from pandas.core.reshape.concat import concat - - def _dict_round(df: DataFrame, decimals): - for col, vals in df.items(): - try: - yield _series_round(vals, decimals[col]) - except KeyError: - yield vals - - def _series_round(ser: Series, decimals: int): - if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype): - return ser.round(decimals) - return ser - - nv.validate_round(args, kwargs) - - if isinstance(decimals, (dict, Series)): - if isinstance(decimals, Series) and not decimals.index.is_unique: - raise ValueError("Index of decimals must be unique") - if is_dict_like(decimals) and not all( - is_integer(value) for _, value in decimals.items() - ): - raise TypeError("Values in decimals must be integers") - new_cols = list(_dict_round(self, decimals)) - elif is_integer(decimals): - # Dispatch to Series.round - new_cols = [_series_round(v, decimals) for _, v in self.items()] - else: - raise TypeError("decimals must be an integer, a dict-like or a Series") - - if len(new_cols) > 0: - return self._constructor( - concat(new_cols, axis=1), index=self.index, columns=self.columns - 
).__finalize__(self, method="round") - else: - return self - - # ---------------------------------------------------------------------- - # Statistical methods, etc. - - @validate_bool_kwargs_from_keywords('numeric_only') - def corr( - self, - method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", - min_periods: int = 1, - numeric_only: bool = True, - ) -> DataFrame: - """ - Compute pairwise correlation of columns, excluding NA/null values. - - Parameters - ---------- - method : {'pearson', 'kendall', 'spearman'} or callable - Method of correlation: - - * pearson : standard correlation coefficient - * kendall : Kendall Tau correlation coefficient - * spearman : Spearman rank correlation - * callable: callable with input two 1d ndarrays - and returning a float. Note that the returned matrix from corr - will have 1 along the diagonals and will be symmetric - regardless of the callable's behavior. - min_periods : int, optional - Minimum number of observations required per pair of columns - to have a valid result. Currently only available for Pearson - and Spearman correlation. - numeric_only : bool, default True - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - DataFrame - Correlation matrix. - - See Also - -------- - DataFrame.corrwith : Compute pairwise correlation with another - DataFrame or Series. - Series.corr : Compute the correlation between two Series. - - Notes - ----- - Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. - - * `Pearson correlation coefficient `_ - * `Kendall rank correlation coefficient `_ - * `Spearman's rank correlation coefficient `_ - - Examples - -------- - >>> def histogram_intersection(a, b): - ... v = np.minimum(a, b).sum().round(decimals=1) - ... return v - >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], - ... columns=['dogs', 'cats']) - >>> df.corr(method=histogram_intersection) - dogs cats - dogs 1.0 0.3 - cats 0.3 1.0 - - >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], - ... columns=['dogs', 'cats']) - >>> df.corr(min_periods=3) - dogs cats - dogs 1.0 NaN - cats NaN 1.0 - """ # noqa:E501 - if numeric_only: - data = self._get_numeric_data() - else: - data = self - cols = data.columns - idx = cols.copy() - mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) - - if method == "pearson": - correl = libalgos.nancorr(mat, minp=min_periods) - elif method == "spearman": - correl = libalgos.nancorr_spearman(mat, minp=min_periods) - elif method == "kendall" or callable(method): - if min_periods is None: - min_periods = 1 - mat = mat.T - corrf = nanops.get_corr_func(method) - K = len(cols) - correl = np.empty((K, K), dtype=float) - mask = np.isfinite(mat) - for i, ac in enumerate(mat): - for j, bc in enumerate(mat): - if i > j: - continue - - valid = mask[i] & mask[j] - if valid.sum() < min_periods: - c = np.nan - elif i == j: - c = 1.0 - elif not valid.all(): - c = corrf(ac[valid], bc[valid]) - else: - c = corrf(ac, bc) - correl[i, j] = c - correl[j, i] = c - else: - raise ValueError( - "method must be either 'pearson', " - "'spearman', 'kendall', or a callable, " - f"'{method}' was supplied" - ) - - return self._constructor(correl, index=idx, columns=cols) - - @validate_bool_kwargs_from_keywords('numeric_only') - def cov( - self, - min_periods: int | None = None, - ddof: int | None = 1, - numeric_only: bool = True, - ) -> DataFrame: - """ - Compute pairwise covariance of columns, excluding NA/null values. 
- - Compute the pairwise covariance among the series of a DataFrame. - The returned data frame is the `covariance matrix - `__ of the columns - of the DataFrame. - - Both NA and null values are automatically excluded from the - calculation. (See the note below about bias from missing values.) - A threshold can be set for the minimum number of - observations for each value created. Comparisons with observations - below this threshold will be returned as ``NaN``. - - This method is generally used for the analysis of time series data to - understand the relationship between different measures - across time. - - Parameters - ---------- - min_periods : int, optional - Minimum number of observations required per pair of columns - to have a valid result. - - ddof : int, default 1 - Delta degrees of freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - - .. versionadded:: 1.1.0 - - numeric_only : bool, default True - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - DataFrame - The covariance matrix of the series of the DataFrame. - - See Also - -------- - Series.cov : Compute covariance with another Series. - core.window.ExponentialMovingWindow.cov: Exponential weighted sample covariance. - core.window.Expanding.cov : Expanding sample covariance. - core.window.Rolling.cov : Rolling sample covariance. - - Notes - ----- - Returns the covariance matrix of the DataFrame's time series. - The covariance is normalized by N-ddof. - - For DataFrames that have Series that are missing data (assuming that - data is `missing at random - `__) - the returned covariance matrix will be an unbiased estimate - of the variance and covariance between the member Series. - - However, for many applications this estimate may not be acceptable - because the estimate covariance matrix is not guaranteed to be positive - semi-definite. This could lead to estimate correlations having - absolute values which are greater than one, and/or a non-invertible - covariance matrix. See `Estimation of covariance matrices - `__ for more details. - - Examples - -------- - >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], - ... columns=['dogs', 'cats']) - >>> df.cov() - dogs cats - dogs 0.666667 -1.000000 - cats -1.000000 1.666667 - - >>> np.random.seed(42) - >>> df = pd.DataFrame(np.random.randn(1000, 5), - ... columns=['a', 'b', 'c', 'd', 'e']) - >>> df.cov() - a b c d e - a 0.998438 -0.020161 0.059277 -0.008943 0.014144 - b -0.020161 1.059352 -0.008543 -0.024738 0.009826 - c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 - d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 - e 0.014144 0.009826 -0.000271 -0.013692 0.977795 - - **Minimum number of periods** - - This method also supports an optional ``min_periods`` keyword - that specifies the required minimum number of non-NA observations for - each column pair in order to have a valid result: - - >>> np.random.seed(42) - >>> df = pd.DataFrame(np.random.randn(20, 3), - ... 
columns=['a', 'b', 'c']) - >>> df.loc[df.index[:5], 'a'] = np.nan - >>> df.loc[df.index[5:10], 'b'] = np.nan - >>> df.cov(min_periods=12) - a b c - a 0.316741 NaN -0.150812 - b NaN 1.248003 0.191417 - c -0.150812 0.191417 0.895202 - """ - if numeric_only: - data = self._get_numeric_data() - else: - data = self - cols = data.columns - idx = cols.copy() - mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) - - if notna(mat).all(): - if min_periods is not None and min_periods > len(mat): - base_cov = np.empty((mat.shape[1], mat.shape[1])) - base_cov.fill(np.nan) - else: - base_cov = np.cov(mat.T, ddof=ddof) - base_cov = base_cov.reshape((len(cols), len(cols))) - else: - base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) - - return self._constructor(base_cov, index=idx, columns=cols) - - @validate_bool_kwargs_from_keywords('numeric_only') - def corrwith( - self, - other, - axis: Axis = 0, - drop=False, - method="pearson", - numeric_only: bool = True, - ) -> Series: - """ - Compute pairwise correlation. - - Pairwise correlation is computed between rows or columns of - DataFrame with rows or columns of Series or DataFrame. DataFrames - are first aligned along both axes before computing the - correlations. - - Parameters - ---------- - other : DataFrame, Series - Object with which to compute correlations. - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for - row-wise. - drop : bool, default False - Drop missing indices from result. - method : {'pearson', 'kendall', 'spearman'} or callable - Method of correlation: - - * pearson : standard correlation coefficient - * kendall : Kendall Tau correlation coefficient - * spearman : Spearman rank correlation - * callable: callable with input two 1d ndarrays - and returning a float. - - numeric_only : bool, default True - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Pairwise correlations. - - See Also - -------- - DataFrame.corr : Compute pairwise correlation of columns. - - Examples - -------- - >>> index = ["a", "b", "c", "d", "e"] - >>> columns = ["one", "two", "three", "four"] - >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) - >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) - >>> df1.corrwith(df2) - one 1.0 - two 1.0 - three 1.0 - four 1.0 - dtype: float64 - - >>> df2.corrwith(df1, axis=1) - a 1.0 - b 1.0 - c 1.0 - d 1.0 - e NaN - dtype: float64 - """ # noqa:E501 - axis = self._get_axis_number(axis) - if numeric_only: - this = self._get_numeric_data() - else: - this = self - - # GH46174: when other is a Series object and axis=0, we achieve a speedup over - # passing .corr() to .apply() by taking the columns as ndarrays and iterating - # over the transposition row-wise. Then we delegate the correlation coefficient - # computation and null-masking to np.corrcoef and np.isnan respectively, - # which are much faster. We exploit the fact that the Spearman correlation - # of two vectors is equal to the Pearson correlation of their ranks to use - # substantially the same method for Pearson and Spearman, - # just with intermediate argsorts on the latter. 
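# Editor's sketch of the rank trick described above (hypothetical values,
# not part of the original patch):
#   x = np.array([10.0, 30.0, 20.0]); y = np.array([1.0, 9.0, 4.0])
#   rx = x.argsort().argsort()  # ranks -> array([0, 2, 1])
#   ry = y.argsort().argsort()  # ranks -> array([0, 2, 1])
#   np.corrcoef(rx, ry)[0, 1]   # Pearson of the ranks == Spearman of x, y
# which is why the np.corrcoef fast path below serves both methods.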
- if isinstance(other, Series): - if axis == 0 and method in ["pearson", "spearman"]: - corrs = {} - if numeric_only: - cols = self.select_dtypes(include=np.number).columns - ndf = self[cols].values.transpose() - else: - cols = self.columns - ndf = self.values.transpose() - k = other.values - if method == "pearson": - for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & ~np.isnan(k) - corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[ - 0, 1 - ] - else: - for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & ~np.isnan(k) - corrs[cols[i]] = np.corrcoef( - r[nonnull_mask].argsort().argsort(), - k[nonnull_mask].argsort().argsort(), - )[0, 1] - return Series(corrs) - else: - return this.apply(lambda x: other.corr(x, method=method), axis=axis) - - other = other._get_numeric_data() - left, right = this.align(other, join="inner", copy=False) - - if axis == 1: - left = left.T - right = right.T - - if method == "pearson": - # mask missing values - left = left + right * 0 - right = right + left * 0 - - # demeaned data - ldem = left - left.mean() - rdem = right - right.mean() - - num = (ldem * rdem).sum() - dom = (left.count() - 1) * left.std() * right.std() - - correl = num / dom - - elif method in ["kendall", "spearman"] or callable(method): - - def c(x): - return nanops.nancorr(x[0], x[1], method=method) - - correl = self._constructor_sliced( - map(c, zip(left.values.T, right.values.T)), index=left.columns - ) - - else: - raise ValueError( - f"Invalid method {method} was passed, " - "valid methods are: 'pearson', 'kendall', " - "'spearman', or callable" - ) - - if not drop: - # Find non-matching labels along the given axis - # and append missing correlations (GH 22375) - raxis = 1 if axis == 0 else 0 - result_index = this._get_axis(raxis).union(other._get_axis(raxis)) - idx_diff = result_index.difference(correl.index) - - if len(idx_diff) > 0: - correl = correl._append( - Series([np.nan] * len(idx_diff), index=idx_diff) - ) - - return correl - - # ---------------------------------------------------------------------- - # ndarray-like stats methods - - @validate_bool_kwargs_from_keywords('numeric_only') - def count( - self, axis: Axis = 0, level: Level | None = None, numeric_only: bool = False - ): - """ - Count non-NA cells for each column or row. - - The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending - on `pandas.options.mode.use_inf_as_na`) are considered NA. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - If 0 or 'index' counts are generated for each column. - If 1 or 'columns' counts are generated for each row. - level : int or str, optional - If the axis is a `MultiIndex` (hierarchical), count along a - particular `level`, collapsing into a `DataFrame`. - A `str` specifies the level name. - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - - Returns - ------- - Series or DataFrame - For each column/row the number of non-NA/null entries. - If `level` is specified returns a `DataFrame`. - - See Also - -------- - Series.count: Number of non-NA elements in a Series. - DataFrame.value_counts: Count unique combinations of columns. - DataFrame.shape: Number of DataFrame rows and columns (including NA - elements). - DataFrame.isna: Boolean same-sized DataFrame showing places of NA - elements. - - Examples - -------- - Constructing DataFrame from a dictionary: - - >>> df = pd.DataFrame({"Person": - ... ["John", "Myla", "Lewis", "John", "Myla"], - ... 
"Age": [24., np.nan, 21., 33, 26], - ... "Single": [False, True, True, True, False]}) - >>> df - Person Age Single - 0 John 24.0 False - 1 Myla NaN True - 2 Lewis 21.0 True - 3 John 33.0 True - 4 Myla 26.0 False - - Notice the uncounted NA values: - - >>> df.count() - Person 5 - Age 4 - Single 5 - dtype: int64 - - Counts for each **row**: - - >>> df.count(axis='columns') - 0 3 - 1 2 - 2 3 - 3 3 - 4 3 - dtype: int64 - """ - axis = self._get_axis_number(axis) - if level is not None: - warnings.warn( - "Using the level keyword in DataFrame and Series aggregations is " - "deprecated and will be removed in a future version. Use groupby " - "instead. df.count(level=1) should use df.groupby(level=1).count().", - FutureWarning, - stacklevel=find_stack_level(), - ) - res = self._count_level(level, axis=axis, numeric_only=numeric_only) - return res.__finalize__(self, method="count") - - if numeric_only: - frame = self._get_numeric_data() - else: - frame = self - - # GH #423 - if len(frame._get_axis(axis)) == 0: - result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) - else: - if frame._is_mixed_type or frame._mgr.any_extension_types: - # the or any_extension_types is really only hit for single- - # column frames with an extension array - result = notna(frame).sum(axis=axis) - else: - # GH13407 - series_counts = notna(frame).sum(axis=axis) - counts = series_counts.values - result = self._constructor_sliced( - counts, index=frame._get_agg_axis(axis) - ) - - return result.astype("int64").__finalize__(self, method="count") - - @validate_bool_kwargs_from_keywords('numeric_only') - def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False): - if numeric_only: - frame = self._get_numeric_data() - else: - frame = self - - count_axis = frame._get_axis(axis) - agg_axis = frame._get_agg_axis(axis) - - if not isinstance(count_axis, MultiIndex): - raise TypeError( - f"Can only count levels on hierarchical {self._get_axis_name(axis)}." 
- ) - - # Mask NaNs: Mask rows or columns where the index level is NaN, and all - # values in the DataFrame that are NaN - if frame._is_mixed_type: - # Since we have mixed types, calling notna(frame.values) might - # upcast everything to object - values_mask = notna(frame).values - else: - # But use the speedup when we have homogeneous dtypes - values_mask = notna(frame.values) - - index_mask = notna(count_axis.get_level_values(level=level)) - if axis == 1: - mask = index_mask & values_mask - else: - mask = index_mask.reshape(-1, 1) & values_mask - - if isinstance(level, str): - level = count_axis._get_level_number(level) - - level_name = count_axis._names[level] - level_index = count_axis.levels[level]._rename(name=level_name) - level_codes = ensure_platform_int(count_axis.codes[level]) - counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) - - if axis == 1: - result = self._constructor(counts, index=agg_axis, columns=level_index) - else: - result = self._constructor(counts, index=level_index, columns=agg_axis) - - return result - - @validate_bool_kwargs_from_keywords('numeric_only') - def _reduce( - self, - op, - name: str, - *, - axis: Axis = 0, - skipna: bool = True, - numeric_only: bool | None = None, - filter_type=None, - **kwds, - ): - - assert filter_type is None or filter_type == "bool", filter_type - out_dtype = "bool" if filter_type == "bool" else None - - if numeric_only is None and name in ["mean", "median"]: - own_dtypes = [arr.dtype for arr in self._mgr.arrays] - - dtype_is_dt = np.array( - [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], - dtype=bool, - ) - if dtype_is_dt.any(): - warnings.warn( - "DataFrame.mean and DataFrame.median with numeric_only=None " - "will include datetime64 and datetime64tz columns in a " - "future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - # Non-copy equivalent to - # dt64_cols = self.dtypes.apply(is_datetime64_any_dtype) - # cols = self.columns[~dt64_cols] - # self = self[cols] - predicate = lambda x: not is_datetime64_any_dtype(x.dtype) - mgr = self._mgr._get_data_subset(predicate) - self = type(self)(mgr) - - # TODO: Make other agg func handle axis=None properly GH#21597 - axis = self._get_axis_number(axis) - labels = self._get_agg_axis(axis) - assert axis in [0, 1] - - def func(values: np.ndarray): - # We only use this in the case that operates on self.values - return op(values, axis=axis, skipna=skipna, **kwds) - - def blk_func(values, axis=1): - if isinstance(values, ExtensionArray): - if not is_1d_only_ea_dtype(values.dtype) and not isinstance( - self._mgr, ArrayManager - ): - return values._reduce(name, axis=1, skipna=skipna, **kwds) - return values._reduce(name, skipna=skipna, **kwds) - else: - return op(values, axis=axis, skipna=skipna, **kwds) - - def _get_data() -> DataFrame: - if filter_type is None: - data = self._get_numeric_data() - else: - # GH#25101, GH#24434 - assert filter_type == "bool" - data = self._get_bool_data() - return data - - if numeric_only is not None or axis == 0: - # For numeric_only non-None and axis non-None, we know - # which blocks to use and no try/except is needed. 
- # For numeric_only=None only the case with axis==0 and no object - # dtypes are unambiguous can be handled with BlockManager.reduce - # Case with EAs see GH#35881 - df = self - if numeric_only is True: - df = _get_data() - if axis == 1: - df = df.T - axis = 0 - - ignore_failures = numeric_only is None - - # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager.reduce - res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) - out = df._constructor(res).iloc[0] - if out_dtype is not None: - out = out.astype(out_dtype) - if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) - - if numeric_only is None and out.shape[0] != df.shape[1]: - # columns have been dropped GH#41480 - arg_name = "numeric_only" - if name in ["all", "any"]: - arg_name = "bool_only" - warnings.warn( - "Dropping of nuisance columns in DataFrame reductions " - f"(with '{arg_name}=None') is deprecated; in a future " - "version this will raise TypeError. Select only valid " - "columns before calling the reduction.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return out - - assert numeric_only is None - - data = self - values = data.values - - try: - result = func(values) - - except TypeError: - # e.g. in nanops trying to convert strs to float - - data = _get_data() - labels = data._get_agg_axis(axis) - - values = data.values - with np.errstate(all="ignore"): - result = func(values) - - # columns have been dropped GH#41480 - arg_name = "numeric_only" - if name in ["all", "any"]: - arg_name = "bool_only" - warnings.warn( - "Dropping of nuisance columns in DataFrame reductions " - f"(with '{arg_name}=None') is deprecated; in a future " - "version this will raise TypeError. Select only valid " - "columns before calling the reduction.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if hasattr(result, "dtype"): - if filter_type == "bool" and notna(result).all(): - result = result.astype(np.bool_) - elif filter_type is None and is_object_dtype(result.dtype): - try: - result = result.astype(np.float64) - except (ValueError, TypeError): - # try to coerce to the original dtypes item by item if we can - pass - - result = self._constructor_sliced(result, index=labels) - return result - - @validate_bool_kwargs_from_keywords('skipna') - def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: - """ - Special case for _reduce to try to avoid a potentially-expensive transpose. - - Apply the reduction block-wise along axis=1 and then reduce the resulting - 1D arrays. 
- """ - if name == "all": - result = np.ones(len(self), dtype=bool) - ufunc = np.logical_and - elif name == "any": - result = np.zeros(len(self), dtype=bool) - # error: Incompatible types in assignment - # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'], - # Literal[20], Literal[False]]", variable has type - # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20], - # Literal[True]]") - ufunc = np.logical_or # type: ignore[assignment] - else: - raise NotImplementedError(name) - - for arr in self._mgr.arrays: - middle = func(arr, axis=0, skipna=skipna) - result = ufunc(result, middle) - - res_ser = self._constructor_sliced(result, index=self.index) - return res_ser - - @validate_bool_kwargs_from_keywords('dropna') - def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: - """ - Count number of distinct elements in specified axis. - - Return Series with number of distinct elements. Can ignore NaN - values. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for - column-wise. - dropna : bool, default True - Don't include NaN in the counts. - - Returns - ------- - Series - - See Also - -------- - Series.nunique: Method nunique for Series. - DataFrame.count: Count non-NA cells for each column or row. - - Examples - -------- - >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) - >>> df.nunique() - A 3 - B 2 - dtype: int64 - - >>> df.nunique(axis=1) - 0 1 - 1 2 - 2 2 - dtype: int64 - """ - return self.apply(Series.nunique, axis=axis, dropna=dropna) - - @doc(_shared_docs["idxmin"], numeric_only_default="False") - @validate_bool_kwargs_from_keywords('skipna', 'numeric_only') - def idxmin( - self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False - ) -> Series: - axis = self._get_axis_number(axis) - if numeric_only: - data = self._get_numeric_data() - else: - data = self - - res = data._reduce( - nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False - ) - indices = res._values - - # indices will always be np.ndarray since axis is not None and - # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy - - index = data._get_axis(axis) - result = [index[i] if i >= 0 else np.nan for i in indices] - return data._constructor_sliced(result, index=data._get_agg_axis(axis)) - - @doc(_shared_docs["idxmax"], numeric_only_default="False") - @validate_bool_kwargs_from_keywords('skipna', 'numeric_only') - def idxmax( - self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False - ) -> Series: - - axis = self._get_axis_number(axis) - if numeric_only: - data = self._get_numeric_data() - else: - data = self - - res = data._reduce( - nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False - ) - indices = res._values - - # indices will always be np.ndarray since axis is not None and - # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy - - index = data._get_axis(axis) - result = [index[i] if i >= 0 else np.nan for i in indices] - return data._constructor_sliced(result, index=data._get_agg_axis(axis)) - - def _get_agg_axis(self, axis_num: int) -> Index: - """ - Let's be explicit about this. 
- """ - if axis_num == 0: - return self.columns - elif axis_num == 1: - return self.index - else: - raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") - - @validate_bool_kwargs_from_keywords('skipna', 'dropna') - def mode( - self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True - ) -> DataFrame: - """ - Get the mode(s) of each element along the selected axis. - - The mode of a set of values is the value that appears most often. - It can be multiple values. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to iterate over while searching for the mode: - - * 0 or 'index' : get mode of each column - * 1 or 'columns' : get mode of each row. - - numeric_only : bool, default False - If True, only apply to numeric columns. - dropna : bool, default True - Don't consider counts of NaN/NaT. - - Returns - ------- - DataFrame - The modes of each column or row. - - See Also - -------- - Series.mode : Return the highest frequency value in a Series. - Series.value_counts : Return the counts of values in a Series. - - Examples - -------- - >>> df = pd.DataFrame([('bird', 2, 2), - ... ('mammal', 4, np.nan), - ... ('arthropod', 8, 0), - ... ('bird', 2, np.nan)], - ... index=('falcon', 'horse', 'spider', 'ostrich'), - ... columns=('species', 'legs', 'wings')) - >>> df - species legs wings - falcon bird 2 2.0 - horse mammal 4 NaN - spider arthropod 8 0.0 - ostrich bird 2 NaN - - By default, missing values are not considered, and the mode of wings - are both 0 and 2. Because the resulting DataFrame has two rows, - the second row of ``species`` and ``legs`` contains ``NaN``. - - >>> df.mode() - species legs wings - 0 bird 2.0 0.0 - 1 NaN NaN 2.0 - - Setting ``dropna=False`` ``NaN`` values are considered and they can be - the mode (like for wings). - - >>> df.mode(dropna=False) - species legs wings - 0 bird 2 NaN - - Setting ``numeric_only=True``, only the mode of numeric columns is - computed, and columns of other types are ignored. - - >>> df.mode(numeric_only=True) - legs wings - 0 2.0 0.0 - 1 NaN 2.0 - - To compute the mode over columns and not rows, use the axis parameter: - - >>> df.mode(axis='columns', numeric_only=True) - 0 1 - falcon 2.0 NaN - horse 4.0 NaN - spider 0.0 8.0 - ostrich 2.0 NaN - """ - data = self if not numeric_only else self._get_numeric_data() - - def f(s): - return s.mode(dropna=dropna) - - data = data.apply(f, axis=axis) - # Ensure index is type stable (should always use int index) - if data.empty: - data.index = default_index(0) - - return data - - def quantile( - self, - q=0.5, - axis: Axis = 0, - numeric_only: bool | lib.NoDefault = no_default, - interpolation: str = "linear", - ): - """ - Return values at the given quantile over requested axis. - - Parameters - ---------- - q : float or array-like, default 0.5 (50% quantile) - Value between 0 <= q <= 1, the quantile(s) to compute. - axis : {0, 1, 'index', 'columns'}, default 0 - Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - numeric_only : bool, default True - If False, the quantile of datetime and timedelta data will be - computed as well. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - This optional parameter specifies the interpolation method to use, - when the desired quantile lies between two data points `i` and `j`: - - * linear: `i + (j - i) * fraction`, where `fraction` is the - fractional part of the index surrounded by `i` and `j`. - * lower: `i`. - * higher: `j`. 
- * nearest: `i` or `j` whichever is nearest. - * midpoint: (`i` + `j`) / 2. - - Returns - ------- - Series or DataFrame - - If ``q`` is an array, a DataFrame will be returned where the - index is ``q``, the columns are the columns of self, and the - values are the quantiles. - If ``q`` is a float, a Series will be returned where the - index is the columns of self and the values are the quantiles. - - See Also - -------- - core.window.Rolling.quantile: Rolling quantile. - numpy.percentile: Numpy function to compute the percentile. - - Examples - -------- - >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), - ... columns=['a', 'b']) - >>> df.quantile(.1) - a 1.3 - b 3.7 - Name: 0.1, dtype: float64 - >>> df.quantile([.1, .5]) - a b - 0.1 1.3 3.7 - 0.5 2.5 55.0 - - Specifying `numeric_only=False` will also compute the quantile of - datetime and timedelta data. - - >>> df = pd.DataFrame({'A': [1, 2], - ... 'B': [pd.Timestamp('2010'), - ... pd.Timestamp('2011')], - ... 'C': [pd.Timedelta('1 days'), - ... pd.Timedelta('2 days')]}) - >>> df.quantile(0.5, numeric_only=False) - A 1.5 - B 2010-07-02 12:00:00 - C 1 days 12:00:00 - Name: 0.5, dtype: object - """ - validate_percentile(q) - axis = self._get_axis_number(axis) - any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes) - if numeric_only is no_default and any_not_numeric: - warnings.warn( - "In future versions of pandas, numeric_only will be set to " - "False by default, and the datetime/timedelta columns will " - "be considered in the results. To not consider these columns" - "specify numeric_only=True.", - FutureWarning, - stacklevel=find_stack_level(), - ) - numeric_only = True - - if not is_list_like(q): - # BlockManager.quantile expects listlike, so we wrap and unwrap here - res_df = self.quantile( - [q], axis=axis, numeric_only=numeric_only, interpolation=interpolation - ) - res = res_df.iloc[0] - if axis == 1 and len(self) == 0: - # GH#41544 try to get an appropriate dtype - dtype = find_common_type(list(self.dtypes)) - if needs_i8_conversion(dtype): - return res.astype(dtype) - return res - - q = Index(q, dtype=np.float64) - data = self._get_numeric_data() if numeric_only else self - - if axis == 1: - data = data.T - - if len(data.columns) == 0: - # GH#23925 _get_numeric_data may have dropped all columns - cols = Index([], name=self.columns.name) - - dtype = np.float64 - if axis == 1: - # GH#41544 try to get an appropriate dtype - cdtype = find_common_type(list(self.dtypes)) - if needs_i8_conversion(cdtype): - dtype = cdtype - - if is_list_like(q): - res = self._constructor([], index=q, columns=cols, dtype=dtype) - return res.__finalize__(self, method="quantile") - return self._constructor_sliced([], index=cols, name=q, dtype=dtype) - - res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation) - - result = self._constructor(res) - return result.__finalize__(self, method="quantile") - - @doc(NDFrame.asfreq, **_shared_doc_kwargs) - @validate_bool_kwargs_from_keywords('normalize') - def asfreq( - self, - freq: Frequency, - method=None, - how: str | None = None, - normalize: bool = False, - fill_value=None, - ) -> DataFrame: - return super().asfreq( - freq=freq, - method=method, - how=how, - normalize=normalize, - fill_value=fill_value, - ) - - @doc(NDFrame.resample, **_shared_doc_kwargs) - def resample( - self, - rule, - axis=0, - closed: str | None = None, - label: str | None = None, - convention: str = "start", - kind: str | None = None, - loffset=None, - base: int | None = None, - on=None, - 
level=None, - origin: str | TimestampConvertibleTypes = "start_day", - offset: TimedeltaConvertibleTypes | None = None, - group_keys: bool | lib.NoDefault = no_default, - ) -> Resampler: - return super().resample( - rule=rule, - axis=axis, - closed=closed, - label=label, - convention=convention, - kind=kind, - loffset=loffset, - base=base, - on=on, - level=level, - origin=origin, - offset=offset, - group_keys=group_keys, - ) - - @validate_bool_kwargs_from_keywords('copy') - def to_timestamp( - self, - freq: Frequency | None = None, - how: str = "start", - axis: Axis = 0, - copy: bool = True, - ) -> DataFrame: - """ - Cast to DatetimeIndex of timestamps, at *beginning* of period. - - Parameters - ---------- - freq : str, default frequency of PeriodIndex - Desired frequency. - how : {'s', 'e', 'start', 'end'} - Convention for converting period to timestamp; start of period - vs. end. - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to convert (the index by default). - copy : bool, default True - If False then underlying input data is not copied. - - Returns - ------- - DataFrame with DatetimeIndex - """ - new_obj = self.copy(deep=copy) - - axis_name = self._get_axis_name(axis) - old_ax = getattr(self, axis_name) - if not isinstance(old_ax, PeriodIndex): - raise TypeError(f"unsupported Type {type(old_ax).__name__}") - - new_ax = old_ax.to_timestamp(freq=freq, how=how) - - setattr(new_obj, axis_name, new_ax) - return new_obj - - @validate_bool_kwargs_from_keywords('copy') - def to_period( - self, freq: Frequency | None = None, axis: Axis = 0, copy: bool = True - ) -> DataFrame: - """ - Convert DataFrame from DatetimeIndex to PeriodIndex. - - Convert DataFrame from DatetimeIndex to PeriodIndex with desired - frequency (inferred from index if not passed). - - Parameters - ---------- - freq : str, default - Frequency of the PeriodIndex. - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to convert (the index by default). - copy : bool, default True - If False then underlying input data is not copied. - - Returns - ------- - DataFrame with PeriodIndex - - Examples - -------- - >>> idx = pd.to_datetime( - ... [ - ... "2001-03-31 00:00:00", - ... "2002-05-31 00:00:00", - ... "2003-08-31 00:00:00", - ... ] - ... ) - - >>> idx - DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], - dtype='datetime64[ns]', freq=None) - - >>> idx.to_period("M") - PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') - - For the yearly frequency - - >>> idx.to_period("Y") - PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]') - """ - new_obj = self.copy(deep=copy) - - axis_name = self._get_axis_name(axis) - old_ax = getattr(self, axis_name) - if not isinstance(old_ax, DatetimeIndex): - raise TypeError(f"unsupported Type {type(old_ax).__name__}") - - new_ax = old_ax.to_period(freq=freq) - - setattr(new_obj, axis_name, new_ax) - return new_obj - - def isin(self, values) -> DataFrame: - """ - Whether each element in the DataFrame is contained in values. - - Parameters - ---------- - values : iterable, Series, DataFrame or dict - The result will only be true at a location if all the - labels match. If `values` is a Series, that's the index. If - `values` is a dict, the keys must be the column names, - which must match. If `values` is a DataFrame, - then both the index and column labels must match. - - Returns - ------- - DataFrame - DataFrame of booleans showing whether each element in the DataFrame - is contained in values. 
- - See Also - -------- - DataFrame.eq: Equality test for DataFrame. - Series.isin: Equivalent method on Series. - Series.str.contains: Test if pattern or regex is contained within a - string of a Series or Index. - - Examples - -------- - >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, - ... index=['falcon', 'dog']) - >>> df - num_legs num_wings - falcon 2 2 - dog 4 0 - - When ``values`` is a list check whether every value in the DataFrame - is present in the list (which animals have 0 or 2 legs or wings) - - >>> df.isin([0, 2]) - num_legs num_wings - falcon True True - dog False True - - To check if ``values`` is *not* in the DataFrame, use the ``~`` operator: - - >>> ~df.isin([0, 2]) - num_legs num_wings - falcon False False - dog True False - - When ``values`` is a dict, we can pass values to check for each - column separately: - - >>> df.isin({'num_wings': [0, 3]}) - num_legs num_wings - falcon False False - dog False True - - When ``values`` is a Series or DataFrame the index and column must - match. Note that 'falcon' does not match based on the number of legs - in other. - - >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]}, - ... index=['spider', 'falcon']) - >>> df.isin(other) - num_legs num_wings - falcon False True - dog False False - """ - if isinstance(values, dict): - from pandas.core.reshape.concat import concat - - values = collections.defaultdict(list, values) - result = concat( - ( - self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns) - ), - axis=1, - ) - elif isinstance(values, Series): - if not values.index.is_unique: - raise ValueError("cannot compute isin with a duplicate axis.") - result = self.eq(values.reindex_like(self), axis="index") - elif isinstance(values, DataFrame): - if not (values.columns.is_unique and values.index.is_unique): - raise ValueError("cannot compute isin with a duplicate axis.") - result = self.eq(values.reindex_like(self)) - else: - if not is_list_like(values): - raise TypeError( - "only list-like or dict-like objects are allowed " - "to be passed to DataFrame.isin(), " - f"you passed a '{type(values).__name__}'" - ) - result = self._constructor( - algorithms.isin(self.values.ravel(), values).reshape(self.shape), - self.index, - self.columns, - ) - return result.__finalize__(self, method="isin") - - # ---------------------------------------------------------------------- - # Add index and columns - _AXIS_ORDERS = ["index", "columns"] - _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { - **NDFrame._AXIS_TO_AXIS_NUMBER, - 1: 1, - "columns": 1, - } - _AXIS_LEN = len(_AXIS_ORDERS) - _info_axis_number = 1 - _info_axis_name = "columns" - - index: Index = properties.AxisProperty( - axis=1, doc="The index (row labels) of the DataFrame." - ) - columns: Index = properties.AxisProperty( - axis=0, doc="The column labels of the DataFrame." - ) - - @property - def _AXIS_NUMBERS(self) -> dict[str, int]: - """.. deprecated:: 1.1.0""" - super()._AXIS_NUMBERS - return {"index": 0, "columns": 1} - - @property - def _AXIS_NAMES(self) -> dict[int, str]: - """.. 
deprecated:: 1.1.0""" - super()._AXIS_NAMES - return {0: "index", 1: "columns"} - - # ---------------------------------------------------------------------- - # Add plotting methods to DataFrame - plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) - hist = pandas.plotting.hist_frame - boxplot = pandas.plotting.boxplot_frame - sparse = CachedAccessor("sparse", SparseFrameAccessor) - - # ---------------------------------------------------------------------- - # Internal Interface Methods - - @validate_bool_kwargs_from_keywords('copy') - def _to_dict_of_blocks(self, copy: bool = True): - """ - Return a dict of dtype -> Constructor Types that - each is a homogeneous dtype. - - Internal ONLY - only works for BlockManager - """ - mgr = self._mgr - # convert to BlockManager if needed -> this way support ArrayManager as well - mgr = mgr_to_mgr(mgr, "block") - mgr = cast(BlockManager, mgr) - return { - k: self._constructor(v).__finalize__(self) - for k, v, in mgr.to_dict(copy=copy).items() - } - - @property - def values(self) -> np.ndarray: - """ - Return a Numpy representation of the DataFrame. - - .. warning:: - - We recommend using :meth:`DataFrame.to_numpy` instead. - - Only the values in the DataFrame will be returned, the axes labels - will be removed. - - Returns - ------- - numpy.ndarray - The values of the DataFrame. - - See Also - -------- - DataFrame.to_numpy : Recommended alternative to this method. - DataFrame.index : Retrieve the index labels. - DataFrame.columns : Retrieving the column names. - - Notes - ----- - The dtype will be a lower-common-denominator dtype (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. Use this - with care if you are not dealing with the blocks. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcast to - int32. By :func:`numpy.find_common_type` convention, mixing int64 - and uint64 will result in a float64 dtype. - - Examples - -------- - A DataFrame where all columns are the same type (e.g., int64) results - in an array of the same type. - - >>> df = pd.DataFrame({'age': [ 3, 29], - ... 'height': [94, 170], - ... 'weight': [31, 115]}) - >>> df - age height weight - 0 3 94 31 - 1 29 170 115 - >>> df.dtypes - age int64 - height int64 - weight int64 - dtype: object - >>> df.values - array([[ 3, 94, 31], - [ 29, 170, 115]]) - - A DataFrame with mixed type columns(e.g., str/object, int64, float32) - results in an ndarray of the broadest type that accommodates these - mixed types (e.g., object). - - >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), - ... ('lion', 80.5, 1), - ... ('monkey', np.nan, None)], - ... 
columns=('name', 'max_speed', 'rank')) - >>> df2.dtypes - name object - max_speed float64 - rank object - dtype: object - >>> df2.values - array([['parrot', 24.0, 'second'], - ['lion', 80.5, 1], - ['monkey', nan, None]], dtype=object) - """ - self._consolidate_inplace() - return self._mgr.as_array() - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) - @validate_bool_kwargs_from_keywords('inplace') - def ffill( - self: DataFrame, - axis: None | Axis = None, - inplace: bool = False, - limit: None | int = None, - downcast=None, - ) -> DataFrame | None: - return super().ffill(axis, inplace, limit, downcast) - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) - @validate_bool_kwargs_from_keywords('inplace') - def bfill( - self: DataFrame, - axis: None | Axis = None, - inplace: bool = False, - limit: None | int = None, - downcast=None, - ) -> DataFrame | None: - return super().bfill(axis, inplace, limit, downcast) - - @deprecate_nonkeyword_arguments( - version=None, allowed_args=["self", "lower", "upper"] - ) - @validate_bool_kwargs_from_keywords('inplace') - def clip( - self: DataFrame, - lower=None, - upper=None, - axis: Axis | None = None, - inplace: bool = False, - *args, - **kwargs, - ) -> DataFrame | None: - return super().clip(lower, upper, axis, inplace, *args, **kwargs) - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"]) - @validate_bool_kwargs_from_keywords('inplace') - def interpolate( - self: DataFrame, - method: str = "linear", - axis: Axis = 0, - limit: int | None = None, - inplace: bool = False, - limit_direction: str | None = None, - limit_area: str | None = None, - downcast: str | None = None, - **kwargs, - ) -> DataFrame | None: - return super().interpolate( - method, - axis, - limit, - inplace, - limit_direction, - limit_area, - downcast, - **kwargs, - ) - - @deprecate_nonkeyword_arguments( - version=None, allowed_args=["self", "cond", "other"] - ) - def where( - self, - cond, - other=lib.no_default, - inplace=False, - axis=None, - level=None, - errors: IgnoreRaise = "raise", - try_cast=lib.no_default, - ): - return super().where(cond, other, inplace, axis, level, errors, try_cast) - - @deprecate_nonkeyword_arguments( - version=None, allowed_args=["self", "cond", "other"] - ) - def mask( - self, - cond, - other=np.nan, - inplace=False, - axis=None, - level=None, - errors: IgnoreRaise = "raise", - try_cast=lib.no_default, - ): - return super().mask(cond, other, inplace, axis, level, errors, try_cast) - - -DataFrame._add_numeric_operations() - -ops.add_flex_arithmetic_methods(DataFrame) - - -def _from_nested_dict(data) -> collections.defaultdict: - new_data: collections.defaultdict = collections.defaultdict(dict) - for index, s in data.items(): - for col, v in s.items(): - new_data[col][index] = v - return new_data - - -def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: - # reindex if necessary - - if value.index.equals(index) or not len(index): - return value._values.copy() - - # GH#4107 - try: - reindexed_value = value.reindex(index)._values - except ValueError as err: - # raised in MultiIndex.from_tuples, see test_insert_error_msmgs - if not value.index.is_unique: - # duplicate axis - raise err - - raise TypeError( - "incompatible index of inserted column with frame index" - ) from err - return reindexed_value From 2b43b3c9f90f9c883a622d34fa0e94f7346485b8 Mon Sep 17 00:00:00 2001 From: Lance <46547065+Condielj@users.noreply.github.com> Date: Wed, 4 May 2022 02:03:26 -0600 
Subject: [PATCH 3/8] added validate_bool_kwargs_from_keywords

---
 pandas/util/_validators.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py
index 8e3de9404fbee..af0be30ab2edd 100644
--- a/pandas/util/_validators.py
+++ b/pandas/util/_validators.py
@@ -12,6 +12,8 @@
 
 import numpy as np
 
+import functools
+
 from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.common import (
@@ -518,3 +520,31 @@ def validate_insert_loc(loc: int, length: int) -> int:
     if not 0 <= loc <= length:
         raise IndexError(f"loc must be an integer between -{length} and {length}")
     return loc
+
+
+def validate_bool_kwargs_from_keywords(*keywords):
+    """
+    Decorator ensuring that the named keyword arguments, when passed,
+    are of type bool.
+
+    Each listed keyword present in the call's kwargs is checked with
+    ``validate_bool_kwarg``, which raises ``ValueError`` for non-bool
+    values.
+
+    Examples
+    --------
+    @validate_bool_kwargs_from_keywords('copy', 'inplace')
+    def method(copy: bool = False, inplace: bool = False):
+        ...
+    """
+    words = set(keywords)
+
+    def validate_bool_kwargs_from_keywords_inner(func):
+        @functools.wraps(func)
+        def validator(*args, **kwargs):
+            for word in words.intersection(kwargs.keys()):
+                validate_bool_kwarg(kwargs[kw], kw)
+            return func(*args, **kwargs)
+        return validator
+
+    return validate_bool_kwargs_from_keywords_inner

From 05d134789d7b5a782e9383f146f9154762237c75 Mon Sep 17 00:00:00 2001
From: Lance <46547065+Condielj@users.noreply.github.com>
Date: Wed, 4 May 2022 02:04:13 -0600
Subject: [PATCH 4/8] added test for validate_bool_kwargs_from_keywords

---
 pandas/tests/series/test_validate.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/pandas/tests/series/test_validate.py b/pandas/tests/series/test_validate.py
index 3c867f7582b7d..843f460649fb0 100644
--- a/pandas/tests/series/test_validate.py
+++ b/pandas/tests/series/test_validate.py
@@ -24,3 +24,21 @@ def test_validate_bool_args(string_series, func, inplace):
 
     with pytest.raises(ValueError, match=msg):
         getattr(string_series, func)(**kwargs)
+
+
+@pytest.mark.parametrize(
+    "func, keyword",
+    [
+        ("sort_values", "inplace"),
+        ("sort_index", "sort_remaining"),
+        ("mode", "dropna"),
+        ("memory_usage", "deep"),
+        ("explode", "ignore_index"),
+        ("compare", "keep_shape"),
+    ],
+)
+def test_validate_bool_kwargs_from_keywords(string_series, func, keyword):
+    msg = f'For argument "{keyword}" expected type bool'
+    kwargs = {keyword: "hello"}
+    with pytest.raises(ValueError, match=msg):
+        getattr(string_series, func)(**kwargs)
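Reviewer note (not part of the patch series): a minimal, self-contained sketch of the behavior the decorator is intended to enforce. ``validate_bool_kwarg`` is reduced here to its core check, the loop uses the ``word`` spelling that PATCH 8/8 below settles on, and ``sort`` is a throwaway placeholder rather than pandas API.

    import functools

    def validate_bool_kwarg(value, arg_name):
        # simplified stand-in for pandas.util._validators.validate_bool_kwarg
        if not isinstance(value, bool):
            raise ValueError(
                f'For argument "{arg_name}" expected type bool, received '
                f"type {type(value).__name__}."
            )
        return value

    def validate_bool_kwargs_from_keywords(*keywords):
        words = set(keywords)

        def inner(func):
            @functools.wraps(func)
            def validator(*args, **kwargs):
                # only keywords the caller actually passed are checked
                for word in words.intersection(kwargs.keys()):
                    validate_bool_kwarg(kwargs[word], word)
                return func(*args, **kwargs)

            return validator

        return inner

    @validate_bool_kwargs_from_keywords("inplace")
    def sort(inplace: bool = False):
        return "sorted"

    sort(inplace=True)     # OK
    sort(inplace="hello")  # ValueError: For argument "inplace" expected type bool ...

One consequence worth noting: the validator only inspects **kwargs, so a non-bool passed positionally bypasses the check. The tests in PATCH 4/8 and PATCH 5/8 always pass the argument by keyword, which is the path the decorator guards.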

From 4c5bc99215a00630ec4b6f1ba8d6096881eeeabf Mon Sep 17 00:00:00 2001
From: Lance <46547065+Condielj@users.noreply.github.com>
Date: Wed, 4 May 2022 02:04:47 -0600
Subject: [PATCH 5/8] added test for validate_bool_kwargs_from_keywords

---
 pandas/tests/frame/test_validate.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/pandas/tests/frame/test_validate.py b/pandas/tests/frame/test_validate.py
index e99e0a6863848..02dc0cf8e2605 100644
--- a/pandas/tests/frame/test_validate.py
+++ b/pandas/tests/frame/test_validate.py
@@ -39,3 +39,23 @@ def test_validate_bool_args(self, dataframe, func, inplace):
 
         with pytest.raises(ValueError, match=msg):
             getattr(dataframe, func)(**kwargs)
+
+
+@pytest.mark.parametrize(
+    "func, keyword",
+    [
+        ("memory_usage", "deep"),
+        ("sort_values", "inplace"),
+        ("sort_index", "sort_remaining"),
+        ("nunique", "dropna"),
+        ("count", "numeric_only"),
+        ("value_counts", "normalize"),
+        ("explode", "ignore_index"),
+        ("compare", "keep_shape"),
+    ],
+)
+def test_validate_bool_kwargs_from_keywords(dataframe, func, keyword):
+    msg = f'For argument "{keyword}" expected type bool'
+    kwargs = {keyword: "hello"}
+    with pytest.raises(ValueError, match=msg):
+        getattr(dataframe, func)(**kwargs)

From 3cfff6cee0b3bdfae2fc8e725d7c0bbb4f5cfd46 Mon Sep 17 00:00:00 2001
From: Lance <46547065+Condielj@users.noreply.github.com>
Date: Wed, 4 May 2022 02:06:20 -0600
Subject: [PATCH 6/8] ENH: validate_bool_kwargs_from_keywords for kwargs

---
 pandas/core/frame.py  | 62 ++++++++++++++++++++++++++++++++++++++++++-
 pandas/core/series.py | 29 ++++++++++++++++++++
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ef5e6dd1d6757..073df7b512b88 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -95,6 +95,7 @@
     validate_axis_style_args,
     validate_bool_kwarg,
     validate_percentile,
+    validate_bool_kwargs_from_keywords,
 )
 
 from pandas.core.dtypes.cast import (
@@ -816,6 +817,7 @@ def __init__(
         NDFrame.__init__(self, mgr)
 
     # ----------------------------------------------------------------------
+    @validate_bool_kwargs_from_keywords('nan_as_null', 'allow_copy')
     def __dataframe__(
         self, nan_as_null: bool = False, allow_copy: bool = True
     ) -> DataFrameXchg:
@@ -986,6 +988,7 @@ def _repr_fits_vertical_(self) -> bool:
         max_rows = get_option("display.max_rows")
         return len(self) <= max_rows
 
+    @validate_bool_kwargs_from_keywords('ignore_width')
     def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
         """
         Check if full repr fits in horizontal boundaries imposed by the display
@@ -1165,6 +1168,7 @@ def to_string(
         "references the column, while the value defines the space to use.",
     )
     @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
+    @validate_bool_kwargs_from_keywords('index', 'index_names', 'show_dimensions')
     def to_string(
         self,
         buf: FilePath | WriteBuffer[str] | None = None,
@@ -1379,7 +1383,7 @@ def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
         for k, v in zip(self.index, self.values):
             s = klass(v, index=columns, name=k).__finalize__(self)
             yield k, s
-
+    @validate_bool_kwargs_from_keywords('index')
     def itertuples(
         self, index: bool = True, name: str | None = "Pandas"
     ) -> Iterable[tuple[Any, ...]]:
@@ -1740,6 +1744,7 @@ def create_index(indexlist, namelist):
         columns = create_index(data["columns"], data["column_names"])
         return cls(realdata, index=index, columns=columns, dtype=dtype)
 
+    @validate_bool_kwargs_from_keywords('copy')
    def to_numpy(
         self,
         dtype: npt.DTypeLike | None = None,
@@ -2011,6 +2016,7 @@ def to_dict(self, orient: str = "dict", into=dict):
         else:
             raise
ValueError(f"orient '{orient}' not understood") + @validate_bool_kwargs_from_keywords('reauth', 'auth_local_webserver', 'progress_bar') def to_gbq( self, destination_table: str, @@ -2496,6 +2502,7 @@ def to_records( return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) @classmethod + @validate_bool_kwargs_from_keywords('verify_integrity') def _from_arrays( cls, arrays, @@ -2550,6 +2557,7 @@ def _from_arrays( compression_options=_shared_docs["compression_options"] % "path", ) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") + @validate_bool_kwargs_from_keywords('write_index') def to_stata( self, path: FilePath | WriteBuffer[bytes], @@ -2911,6 +2919,7 @@ def to_parquet( " Ability to use str", ) @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + @validate_bool_kwargs_from_keywords('index', 'index_names', 'bold_rows', 'escape', 'notebook', 'render_links') def to_html( self, buf: FilePath | WriteBuffer[str] | None = None, @@ -3001,6 +3010,7 @@ def to_html( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buffer", ) + @validate_bool_kwargs_from_keywords('index') def to_xml( self, path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, @@ -3233,6 +3243,7 @@ def info( show_counts=show_counts, ) + @validate_bool_kwargs_from_keywords('index', 'deep') def memory_usage(self, index: bool = True, deep: bool = False) -> Series: """ Return the memory usage of each column in bytes. @@ -3333,6 +3344,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: result = index_memory_usage._append(result) return result + @validate_bool_kwargs_from_keywords('copy') def transpose(self, *args, copy: bool = False) -> DataFrame: """ Transpose index and columns. @@ -3652,6 +3664,7 @@ def _getitem_multilevel(self, key): # loc is neither a slice nor ndarray, so must be an int return self._ixs(loc, axis=1) + @validate_bool_kwargs_from_keywords('takeable') def _get_value(self, index, col, takeable: bool = False) -> Scalar: """ Quickly retrieve single value at passed column and index. @@ -4004,6 +4017,7 @@ def _reset_cacher(self) -> None: # no-op for DataFrame pass + @validate_bool_kwargs_from_keywords('inplace') def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: """ The object has called back to us saying maybe it has changed. @@ -4021,6 +4035,7 @@ def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: # ---------------------------------------------------------------------- # Unsorted + @validate_bool_kwargs_from_keywords('inplace') def query(self, expr: str, inplace: bool = False, **kwargs): """ Query the columns of a DataFrame with a boolean expression. @@ -4186,6 +4201,7 @@ def query(self, expr: str, inplace: bool = False, **kwargs): else: return result + @validate_bool_kwargs_from_keywords('inplace') def eval(self, expr: str, inplace: bool = False, **kwargs): """ Evaluate a string describing operations on DataFrame columns. 
@@ -4758,6 +4774,7 @@ def _reindex_multi( ) @doc(NDFrame.align, **_shared_doc_kwargs) + @validate_bool_kwargs_from_keywords('copy') def align( self, other, @@ -4844,6 +4861,7 @@ def set_axis( see_also_sub=" or columns", ) @Appender(NDFrame.set_axis.__doc__) + @validate_bool_kwargs_from_keywords('inplace') def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): return super().set_axis(labels, axis=axis, inplace=inplace) @@ -4913,6 +4931,7 @@ def drop( # error: Signature of "drop" incompatible with supertype "NDFrame" # github.com/python/mypy/issues/12387 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) + @validate_bool_kwargs_from_keywords('inplace') def drop( # type: ignore[override] self, labels: Hashable | list[Hashable] = None, @@ -5115,6 +5134,7 @@ def rename( ) -> DataFrame | None: ... + @validate_bool_kwargs_from_keywords('inplace', 'copy') def rename( self, mapper: Renamer | None = None, @@ -5362,6 +5382,7 @@ def fillna( @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"]) @doc(NDFrame.fillna, **_shared_doc_kwargs) + @validate_bool_kwargs_from_keywords('inplace') def fillna( self, value: object | ArrayLike | None = None, @@ -5423,6 +5444,7 @@ def pop(self, item: Hashable) -> Series: """ return super().pop(item=item) + @validate_bool_kwargs_from_keywords('inplace') @doc(NDFrame.replace, **_shared_doc_kwargs) def replace( self, @@ -5559,6 +5581,7 @@ def shift( ) @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "keys"]) + @validate_bool_kwargs_from_keywords('inplace', 'drop', 'append', 'verify_integrity') def set_index( self, keys, @@ -5838,6 +5861,7 @@ def reset_index( ... @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) + @validate_bool_kwargs_from_keywords('inplace', 'drop') def reset_index( self, level: Hashable | Sequence[Hashable] | None = None, @@ -6110,6 +6134,7 @@ def notnull(self) -> DataFrame: return ~self.isna() @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @validate_bool_kwargs_from_keywords('inplace') def dropna( self, axis: Axis = 0, @@ -6273,6 +6298,7 @@ def dropna( return result @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"]) + @validate_bool_kwargs_from_keywords('inplace', 'ignore_index') def drop_duplicates( self, subset: Hashable | Sequence[Hashable] | None = None, @@ -6519,6 +6545,7 @@ def f(vals) -> tuple[np.ndarray, int]: @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "by"]) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) + @validate_bool_kwargs_from_keywords('inplace', 'ignore_index') # error: Signature of "sort_values" incompatible with supertype "NDFrame" def sort_values( # type: ignore[override] self, @@ -6642,6 +6669,7 @@ def sort_index( # error: Signature of "sort_index" incompatible with supertype "NDFrame" @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @validate_bool_kwargs_from_keywords('inplace', 'sort_remaining', 'ignore_index') def sort_index( # type: ignore[override] self, axis: Axis = 0, @@ -6755,6 +6783,7 @@ def sort_index( # type: ignore[override] key=key, ) + @validate_bool_kwargs_from_keywords('normalize', 'sort', 'ascending', 'dropna') def value_counts( self, subset: Sequence[Hashable] | None = None, @@ -7482,6 +7511,7 @@ def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: """, klass=_shared_doc_kwargs["klass"], ) + @validate_bool_kwargs_from_keywords('keep_shape', 'keep_equal') def 
compare( self, other: DataFrame, @@ -7496,6 +7526,7 @@ def compare( keep_equal=keep_equal, ) + @validate_bool_kwargs_from_keywords('overwrite') def combine( self, other: DataFrame, func, fill_value=None, overwrite: bool = True ) -> DataFrame: @@ -7741,6 +7772,7 @@ def combiner(x, y): return combined + @validate_bool_kwargs_from_keywords('overwrite') def update( self, other, @@ -8000,6 +8032,7 @@ def update( """ ) @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + @validate_bool_kwargs_from_keywords('as_index', 'sort', 'observed', 'dropna') def groupby( self, by=None, @@ -8373,6 +8406,7 @@ def pivot_table( sort=sort, ) + @validate_bool_kwargs_from_keywords('dropna') def stack(self, level: Level = -1, dropna: bool = True): """ Stack the prescribed level(s) from columns to index. @@ -8548,6 +8582,7 @@ def stack(self, level: Level = -1, dropna: bool = True): return result.__finalize__(self, method="stack") + @validate_bool_kwargs_from_keywords('ignore_index') def explode( self, column: IndexLabel, @@ -9003,6 +9038,7 @@ def transform( assert isinstance(result, DataFrame) return result + @validate_bool_kwargs_from_keywords('raw') def apply( self, func: AggFuncType, @@ -9250,6 +9286,7 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods + @validate_bool_kwargs_from_keywords('ignore_index', 'verify_integrity', 'sort') def append( self, other, @@ -9362,6 +9399,7 @@ def append( return self._append(other, ignore_index, verify_integrity, sort) + @validate_bool_kwargs_from_keywords('ignore_index', 'verify_integrity', 'sort') def _append( self, other, @@ -9421,6 +9459,7 @@ def _append( result = result.reindex(combined_columns, axis=1) return result.__finalize__(self, method="append") + @validate_bool_kwargs_from_keywords('sort') def join( self, other: DataFrame | Series, @@ -9594,6 +9633,7 @@ def join( validate=validate, ) + @validate_bool_kwargs_from_keywords('sort') def _join_compat( self, other: DataFrame | Series, @@ -9677,6 +9717,7 @@ def _join_compat( @Substitution("") @Appender(_merge_doc, indents=2) + @validate_bool_kwargs_from_keywords('left_index', 'right_index', 'copy', 'indicator', 'sort') def merge( self, right: DataFrame | Series, @@ -9829,6 +9870,7 @@ def _series_round(ser: Series, decimals: int): # ---------------------------------------------------------------------- # Statistical methods, etc. 
+ @validate_bool_kwargs_from_keywords('numeric_only') def corr( self, method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", @@ -9942,6 +9984,7 @@ def corr( return self._constructor(correl, index=idx, columns=cols) + @validate_bool_kwargs_from_keywords('numeric_only') def cov( self, min_periods: int | None = None, @@ -10071,6 +10114,7 @@ def cov( return self._constructor(base_cov, index=idx, columns=cols) + @validate_bool_kwargs_from_keywords('numeric_only') def corrwith( self, other, @@ -10235,6 +10279,7 @@ def c(x): # ---------------------------------------------------------------------- # ndarray-like stats methods + @validate_bool_kwargs_from_keywords('numeric_only') def count( self, axis: Axis = 0, level: Level | None = None, numeric_only: bool = False ): @@ -10340,6 +10385,7 @@ def count( return result.astype("int64").__finalize__(self, method="count") + @validate_bool_kwargs_from_keywords('numeric_only') def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False): if numeric_only: frame = self._get_numeric_data() @@ -10385,6 +10431,7 @@ def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False): return result + @validate_bool_kwargs_from_keywords('numeric_only') def _reduce( self, op, @@ -10537,6 +10584,7 @@ def _get_data() -> DataFrame: result = self._constructor_sliced(result, index=labels) return result + @validate_bool_kwargs_from_keywords('skipna') def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: """ Special case for _reduce to try to avoid a potentially-expensive transpose. @@ -10565,6 +10613,7 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: res_ser = self._constructor_sliced(result, index=self.index) return res_ser + @validate_bool_kwargs_from_keywords('dropna') def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ Count number of distinct elements in specified axis. 
@@ -10606,6 +10655,7 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: return self.apply(Series.nunique, axis=axis, dropna=dropna) @doc(_shared_docs["idxmin"], numeric_only_default="False") + @validate_bool_kwargs_from_keywords('skipna', 'numeric_only') def idxmin( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: @@ -10630,6 +10680,7 @@ def idxmin( return data._constructor_sliced(result, index=data._get_agg_axis(axis)) @doc(_shared_docs["idxmax"], numeric_only_default="False") + @validate_bool_kwargs_from_keywords('skipna', 'numeric_only') def idxmax( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: @@ -10665,6 +10716,7 @@ def _get_agg_axis(self, axis_num: int) -> Index: else: raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") + @validate_bool_kwargs_from_keywords('skipna', 'dropna') def mode( self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True ) -> DataFrame: @@ -10884,6 +10936,7 @@ def quantile( return result.__finalize__(self, method="quantile") @doc(NDFrame.asfreq, **_shared_doc_kwargs) + @validate_bool_kwargs_from_keywords('normalize') def asfreq( self, freq: Frequency, @@ -10933,6 +10986,7 @@ def resample( group_keys=group_keys, ) + @validate_bool_kwargs_from_keywords('copy') def to_timestamp( self, freq: Frequency | None = None, @@ -10971,6 +11025,7 @@ def to_timestamp( setattr(new_obj, axis_name, new_ax) return new_obj + @validate_bool_kwargs_from_keywords('copy') def to_period( self, freq: Frequency | None = None, axis: Axis = 0, copy: bool = True ) -> DataFrame: @@ -11170,6 +11225,7 @@ def _AXIS_NAMES(self) -> dict[int, str]: # ---------------------------------------------------------------------- # Internal Interface Methods + @validate_bool_kwargs_from_keywords('copy') def _to_dict_of_blocks(self, copy: bool = True): """ Return a dict of dtype -> Constructor Types that @@ -11264,6 +11320,7 @@ def values(self) -> np.ndarray: return self._mgr.as_array() @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @validate_bool_kwargs_from_keywords('inplace') def ffill( self: DataFrame, axis: None | Axis = None, @@ -11274,6 +11331,7 @@ def ffill( return super().ffill(axis, inplace, limit, downcast) @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @validate_bool_kwargs_from_keywords('inplace') def bfill( self: DataFrame, axis: None | Axis = None, @@ -11286,6 +11344,7 @@ def bfill( @deprecate_nonkeyword_arguments( version=None, allowed_args=["self", "lower", "upper"] ) + @validate_bool_kwargs_from_keywords('inplace') def clip( self: DataFrame, lower=None, @@ -11298,6 +11357,7 @@ def clip( return super().clip(lower, upper, axis, inplace, *args, **kwargs) @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"]) + @validate_bool_kwargs_from_keywords('inplace') def interpolate( self: DataFrame, method: str = "linear", diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d3509cac0edd..b84b195ddb2b1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -64,6 +64,7 @@ validate_ascending, validate_bool_kwarg, validate_percentile, + validate_bool_kwargs_from_keywords, ) from pandas.core.dtypes.cast import ( @@ -330,6 +331,7 @@ class Series(base.IndexOpsMixin, NDFrame): # ---------------------------------------------------------------------- # Constructors + @validate_bool_kwargs_from_keywords('copy', 'fastpath') def __init__( self, data=None, @@ -1055,6 +1057,7 @@ def _get_values(self, indexer: 
slice | npt.NDArray[np.bool_]) -> Series: new_mgr = self._mgr.getitem_mgr(indexer) return self._constructor(new_mgr).__finalize__(self) + @validate_bool_kwargs_from_keywords('takeable') def _get_value(self, label, takeable: bool = False): """ Quickly retrieve single value at passed index label. @@ -1201,6 +1204,7 @@ def _set_values(self, key, value) -> None: self._mgr = self._mgr.setitem(indexer=key, value=value) self._maybe_update_cacher() + @validate_bool_kwargs_from_keywords('takeable') def _set_value(self, label, value, takeable: bool = False): """ Quickly set single value at passed label. @@ -1272,6 +1276,7 @@ def _check_is_chained_assignment_possible(self) -> bool: return True return super()._check_is_chained_assignment_possible() + @validate_bool_kwargs_from_keywords('clear', 'verify_is_copy', 'inplace') def _maybe_update_cacher( self, clear: bool = False, verify_is_copy: bool = True, inplace: bool = False ) -> None: @@ -1368,6 +1373,7 @@ def repeat(self, repeats, axis=None) -> Series: ) @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) + @validate_bool_kwargs_from_keywords('allow_duplicates') def reset_index( self, level=None, @@ -1639,6 +1645,7 @@ def to_string( +----+----------+""" ), ) + @validate_bool_kwargs_from_keywords('index') def to_markdown( self, buf: IO[str] | None = None, @@ -1824,6 +1831,7 @@ def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: df = self._constructor_expanddim(mgr) return df.__finalize__(self, method="to_frame") + @validate_bool_kwargs_from_keywords('inplace') def _set_name(self, name, inplace=False) -> Series: """ Set the Series name. @@ -1921,6 +1929,7 @@ def _set_name(self, name, inplace=False) -> Series: Name: Max Speed, dtype: float64 """ ) + @validate_bool_kwargs_from_keywords('as_index', 'sort', 'observed', 'dropna') @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) def groupby( self, @@ -2032,6 +2041,7 @@ def count(self, level=None): self, method="count" ) + @validate_bool_kwargs_from_keywords('dropna') def mode(self, dropna: bool = True) -> Series: """ Return the mode(s) of the Series. 
@@ -2890,6 +2900,7 @@ def searchsorted( # type: ignore[override] # ------------------------------------------------------------------- # Combination + @validate_bool_kwargs_from_keywords('ignore_index', 'verify_integrity') def append( self, to_append, ignore_index: bool = False, verify_integrity: bool = False ): @@ -2976,6 +2987,7 @@ def append( return self._append(to_append, ignore_index, verify_integrity) + @validate_bool_kwargs_from_keywords('ignore_index', 'verify_integrity') def _append( self, to_append, ignore_index: bool = False, verify_integrity: bool = False ): @@ -3129,6 +3141,7 @@ def _construct_result( """, klass=_shared_doc_kwargs["klass"], ) + @validate_bool_kwargs_from_keywords('keep_shape', 'keep_equal') def compare( self, other: Series, @@ -3371,6 +3384,7 @@ def update(self, other) -> None: # Reindexing, sorting @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @validate_bool_kwargs_from_keywords('inplace', 'ignore_index') def sort_values( self, axis=0, @@ -3630,6 +3644,7 @@ def sort_index( # error: Signature of "sort_index" incompatible with supertype "NDFrame" @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @validate_bool_kwargs_from_keywords('inplace', 'sort_remaining', 'ignore_index') def sort_index( # type: ignore[override] self, axis: Axis = 0, @@ -4132,6 +4147,7 @@ def reorder_levels(self, order) -> Series: result.index = result.index.reorder_levels(order) return result + @validate_bool_kwargs_from_keywords('ignore_index') def explode(self, ignore_index: bool = False) -> Series: """ Transform each element of a list-like to a row. @@ -4412,6 +4428,7 @@ def transform( ).transform() return result + @validate_bool_kwargs_from_keywords('convert_dtype') def apply( self, func: AggFuncType, @@ -4657,6 +4674,7 @@ def rename( ) -> Series | None: ... + @validate_bool_kwargs_from_keywords('copy', 'inplace') def rename( self, index: Renamer | Hashable | None = None, @@ -4789,6 +4807,7 @@ def set_axis(self, labels, axis: Axis = ..., inplace: bool = ...) -> Series | No see_also_sub="", ) @Appender(NDFrame.set_axis.__doc__) + @validate_bool_kwargs_from_keywords('inplace') def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): return super().set_axis(labels, axis=axis, inplace=inplace) @@ -4857,6 +4876,7 @@ def drop( # error: Signature of "drop" incompatible with supertype "NDFrame" # github.com/python/mypy/issues/12387 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) + @validate_bool_kwargs_from_keywords('inplace') def drop( # type: ignore[override] self, labels: Hashable | list[Hashable] = None, @@ -5152,6 +5172,7 @@ def replace( ) @doc(INFO_DOCSTRING, **series_sub_kwargs) + @validate_bool_kwargs_from_keywords('show_counts') def info( self, verbose: bool | None = None, @@ -5167,6 +5188,7 @@ def info( show_counts=show_counts, ) + @validate_bool_kwargs_from_keywords('inplace') def _replace_single(self, to_replace, method: str, inplace: bool, limit): """ Replaces values in a Series using the fill method specified when no @@ -5196,6 +5218,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> Series: periods=periods, freq=freq, axis=axis, fill_value=fill_value ) + @validate_bool_kwargs_from_keywords('index', 'deep') def memory_usage(self, index: bool = True, deep: bool = False) -> int: """ Return the memory usage of the Series. 
@@ -5430,6 +5453,7 @@ def between(self, left, right, inclusive="both") -> Series:
     # ----------------------------------------------------------------------
     # Convert to types that support pd.NA

+    @validate_bool_kwargs_from_keywords('infer_objects', 'convert_string', 'convert_integer', 'convert_boolean', 'convert_floating')
     def _convert_dtypes(
         self,
         infer_objects: bool = True,
@@ -5577,6 +5601,7 @@ def dropna(self, axis=0, inplace=False, how=None):

     # error: Cannot determine type of 'asfreq'
     @doc(NDFrame.asfreq, **_shared_doc_kwargs)  # type: ignore[has-type]
+    @validate_bool_kwargs_from_keywords('normalize')
     def asfreq(
         self,
         freq,
@@ -5684,6 +5709,7 @@ def to_period(self, freq=None, copy=True) -> Series:
     )

     @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
+    @validate_bool_kwargs_from_keywords('inplace')
     def ffill(
         self: Series,
         axis: None | Axis = None,
@@ -5694,6 +5720,7 @@ def ffill(
         return super().ffill(axis, inplace, limit, downcast)

     @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
+    @validate_bool_kwargs_from_keywords('inplace')
     def bfill(
         self: Series,
         axis: None | Axis = None,
@@ -5706,6 +5733,7 @@ def bfill(
     @deprecate_nonkeyword_arguments(
         version=None, allowed_args=["self", "lower", "upper"]
     )
+    @validate_bool_kwargs_from_keywords('inplace')
     def clip(
         self: Series,
         lower=None,
@@ -5718,6 +5746,7 @@ def clip(
         return super().clip(lower, upper, axis, inplace, *args, **kwargs)

     @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"])
+    @validate_bool_kwargs_from_keywords('inplace')
     def interpolate(
         self: Series,
         method: str = "linear",

From aab452391342eb892c748a1a07c8900291cb5255 Mon Sep 17 00:00:00 2001
From: Lance <46547065+Condielj@users.noreply.github.com>
Date: Wed, 4 May 2022 02:12:58 -0600
Subject: [PATCH 7/8] DOC: updated whatsnew pandas-dev#16714

---
 doc/source/whatsnew/v1.5.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 931d18dc349f3..7828e05892887 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -125,6 +125,7 @@ Other enhancements
 - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
 - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`)
+- Added validation of boolean keyword arguments in :class:`Series` and :class:`DataFrame` methods, along with tests (:issue:`16714`)

 .. ---------------------------------------------------------------------------

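The whatsnew entry mentions accompanying tests. A hypothetical pytest sketch of the shape such a test could take — the PR's actual test names, locations, and parametrization may differ, and it is only meaningful on a build that includes these patches:

    import pytest

    import pandas as pd


    @pytest.mark.parametrize(
        "bad_kwargs", [{"inplace": 1.5}, {"ignore_index": "no"}]
    )
    def test_sort_values_rejects_non_bool_kwargs(bad_kwargs):
        # a non-bool for a validated keyword should raise before sorting
        ser = pd.Series([2, 0, 1])
        with pytest.raises(ValueError, match="expected type bool"):
            ser.sort_values(**bad_kwargs)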
 .. _whatsnew_150.notable_bug_fixes:

From 3cab1f28c3ab31eba819dafd8bb29d58c177a6f1 Mon Sep 17 00:00:00 2001
From: Lance <46547065+Condielj@users.noreply.github.com>
Date: Wed, 4 May 2022 04:07:12 -0600
Subject: [PATCH 8/8] ENH: validate bool kwargs (#16714)

---
 pandas/util/_validators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py
index af0be30ab2edd..757c84feedb3f 100644
--- a/pandas/util/_validators.py
+++ b/pandas/util/_validators.py
@@ -552,7 +552,7 @@ def validate_bool_kwargs_from_keywords_inner(func):
         @functools.wraps(func)
         def validator(*args, **kwargs):
             for word in words.intersection(kwargs.keys()):
-                validate_bool_kwarg(kwargs[kw], kw)
+                validate_bool_kwarg(kwargs[word], word)
             return func(*args, **kwargs)
         return validator
     return validate_bool_kwargs_from_keywords_inner
\ No newline at end of file
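This one-line fix is load-bearing: the original body indexed `kwargs` with `kw`, an undefined name, so the very first call that passed any listed keyword would have raised `NameError` instead of validating — `word` is the actual loop variable. A quick sanity check of the fixed behavior, reusing the standalone sketch from earlier (`fake_method` is purely illustrative, not pandas API):

    @validate_bool_kwargs_from_keywords("inplace", "ignore_index")
    def fake_method(data, inplace=False, ignore_index=False):
        return data


    fake_method([1, 2], inplace=True)           # ok: bool passes through
    fake_method([1, 2], True)                   # positional, so never validated

    try:
        fake_method([1, 2], ignore_index="y")   # ValueError now, not NameError
    except ValueError as err:
        print(err)

Since `words.intersection(kwargs.keys())` is empty for calls that pass no listed keyword, the pre-fix bug would have stayed silent on such calls, which is presumably why it surfaced only once a boolean keyword was actually supplied.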