diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b0f35087dc922..0c1e4e330c903 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -119,6 +119,36 @@ repos: entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" types: [python] exclude: ^(asv_bench|pandas/tests|doc)/ + - id: FrameOrSeriesUnion + name: Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias + entry: Union\[.*(Series.*DataFrame|DataFrame.*Series).*\] + language: pygrep + types: [python] + exclude: ^pandas/_typing\.py$ + - id: type-not-class + name: Check for use of foo.__class__ instead of type(foo) + entry: \.__class__ + language: pygrep + files: \.(py|pyx)$ + - id: unwanted-typing + name: Check for use of comment-based annotation syntax and missing error codes + entry: | + (?x) + \#\ type:\ (?!ignore)| + \#\ type:\s?ignore(?!\[) + language: pygrep + types: [python] + - id: no-os-remove + name: Check code for instances of os.remove + entry: os\.remove + language: pygrep + types: [python] + files: ^pandas/tests/ + exclude: | + (?x)^ + pandas/tests/io/excel/test_writers\.py| + pandas/tests/io/pytables/common\.py| + pandas/tests/io/pytables/test_store\.py$ - repo: https://github.com/asottile/yesqa rev: v1.2.2 hooks: diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7c48905135f89..b5d63e259456b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,29 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" # ------------------------------------------------------------------------- - # Type annotations - - MSG='Check for use of comment-based annotation syntax' ; echo $MSG - invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for missing error codes with # type: ignore' ; echo $MSG - invgrep -R --include="*.py" -P '# type:\s?ignore(?!\[)' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias' ; echo $MSG - invgrep -R --include="*.py" --exclude=_typing.py -E 'Union\[.*(Series.*DataFrame|DataFrame.*Series).*\]' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # ------------------------------------------------------------------------- - MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG - invgrep -R --include=*.{py,pyx} '\.__class__' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check code for instances of os.remove' ; echo $MSG - invgrep -R --include="*.py*" --exclude "common.py" --exclude "test_writers.py" --exclude "test_store.py" -E "os\.remove" pandas/tests/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for inconsistent use of pandas namespace in tests' ; echo $MSG for class in "Series" "DataFrame" "Index" "MultiIndex" "Timestamp" "Timedelta" "TimedeltaIndex" "DatetimeIndex" "Categorical"; do check_namespace ${class} diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 4493ddd0b2822..98c981539d207 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -422,6 +422,17 @@ above example, ``s.loc[1:6]`` would raise ``KeyError``. For the rationale behind this behavior, see :ref:`Endpoints are inclusive `. +.. ipython:: python + + s = pd.Series(list('abcdef'), index=[0, 3, 2, 5, 4, 2]) + s.loc[3:5] + +Also, if the index has duplicate labels *and* either the start or the stop label is dupulicated, +an error will be raised. For instance, in the above example, ``s.loc[2:5]`` would raise a ``KeyError``. + +For more information about duplicate labels, see +:ref:`Duplicate Labels `. + .. _indexing.integer: Selection by position diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index cf728d94b2a55..a122154904996 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -23,7 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6f137302d4994..690e6b8f725ad 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -217,6 +217,7 @@ Other enhancements - ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) +- - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`). @@ -229,6 +230,7 @@ Other enhancements - :class:`DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`) - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`) - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`) +- :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) .. _whatsnew_120.api_breaking.python: @@ -340,6 +342,9 @@ Deprecations - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) - Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`) - :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`) +- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`) +- :meth:`Series.slice_shift` and :meth:`DataFrame.slice_shift` are deprecated, use :meth:`Series.shift` or :meth:`DataFrame.shift` instead (:issue:`37601`) + .. --------------------------------------------------------------------------- @@ -390,14 +395,18 @@ Datetimelike - Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) - :class:`Timestamp` and :class:`DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) - Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) +- :meth:`to_json` and :meth:`read_json` now implements timezones parsing when orient structure is 'table'. +- :meth:`astype` now attempts to convert to 'datetime64[ns, tz]' directly from 'object' with inferred timezone from string (:issue:`35973`). - Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) - Bug in :meth:`DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`) - Bug in adding a :class:`BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`) +- Bug in :func:`to_datetime` with a read-only array incorrectly raising (:issue:`34857`) Timedelta ^^^^^^^^^ - Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`) - Bug in parsing of ISO 8601 durations in :class:`Timedelta`, :meth:`pd.to_datetime` (:issue:`37159`, fixes :issue:`29773` and :issue:`36204`) +- Bug in :func:`to_timedelta` with a read-only array incorrectly raising (:issue:`34857`) Timezones ^^^^^^^^^ @@ -427,7 +436,7 @@ Numeric Conversion ^^^^^^^^^^ -- +- Bug in :meth:`DataFrame.to_dict` with ``orient='records'`` now returns python native datetime objects for datetimelike columns (:issue:`21256`) - Strings @@ -455,6 +464,7 @@ Indexing - Bug in :meth:`Series.loc.__getitem__` with a non-unique :class:`MultiIndex` and an empty-list indexer (:issue:`13691`) - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` with a level named "0" (:issue:`37194`) - Bug in :meth:`Series.__getitem__` when using an unsigned integer array as an indexer giving incorrect results or segfaulting instead of raising ``KeyError`` (:issue:`37218`) +- Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`) Missing ^^^^^^^ @@ -467,7 +477,7 @@ MultiIndex - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`) - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) -- +- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`) I/O ^^^ @@ -491,6 +501,8 @@ I/O - Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`) - Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) +- :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) +- Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) Plotting ^^^^^^^^ @@ -522,6 +534,7 @@ Groupby/resample/rolling - Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`) - Bug in :meth:`df.groupby(..).quantile() ` and :meth:`df.resample(..).quantile() ` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`) - Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`) +- Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`) Reshaping ^^^^^^^^^ @@ -534,6 +547,8 @@ Reshaping - Bug in :meth:`DataFrame.pivot` did not preserve :class:`MultiIndex` level names for columns when rows and columns both multiindexed (:issue:`36360`) - Bug in :func:`join` returned a non deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`) - Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`) +- Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`) +- Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`) Sparse ^^^^^^ @@ -556,7 +571,7 @@ Other - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) - Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`) -- Fixed metadata propagation in the :class:`Series.dt` and :class:`Series.str` accessors and :class:`DataFrame.duplicated` and :class:`DataFrame.stack` and :class:`DataFrame.unstack` and :class:`DataFrame.pivot` methods (:issue:`28283`) +- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`) (:issue:`37381`) - Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`) - Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`) - Bug in ``accessor.DirNamesMixin``, where ``dir(obj)`` wouldn't show attributes defined on the instance (:issue:`37173`). diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 0b802f2cc9e69..512b638fc4877 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -392,7 +392,7 @@ class option_context(ContextDecorator): """ def __init__(self, *args): - if not (len(args) % 2 == 0 and len(args) >= 2): + if len(args) % 2 != 0 or len(args) < 2: raise ValueError( "Need to invoke as option_context(pat, val, [(pat, val), ...])." ) @@ -648,7 +648,7 @@ def _build_option_description(k: str) -> str: s += f"\n [default: {o.defval}] [currently: {_get_option(k, True)}]" if d: - rkey = d.rkey if d.rkey else "" + rkey = d.rkey or "" s += "\n (Deprecated" s += f", use `{rkey}` instead." s += ")" diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 3933c8f3d519c..bc76aca93da2a 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -99,8 +99,7 @@ def _valid_locales(locales, normalize): def _default_locale_getter(): - raw_locales = subprocess.check_output(["locale -a"], shell=True) - return raw_locales + return subprocess.check_output(["locale -a"], shell=True) def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_getter): diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 13c7187923473..1b79d68c13570 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -268,7 +268,7 @@ ctypedef fused join_t: @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique(join_t[:] left, join_t[:] right): +def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t] indexer diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e493e5e9d41d3..0b0334d52c1e9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -896,21 +896,28 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, if lab != cur: if lab != -1: - tup = PyTuple_New(k) - for j in range(k): - val = keys[j][sorted_labels[j][i - 1]] - PyTuple_SET_ITEM(tup, j, val) - Py_INCREF(val) - + if k == 1: + # When k = 1 we do not want to return a tuple as key + tup = keys[0][sorted_labels[0][i - 1]] + else: + tup = PyTuple_New(k) + for j in range(k): + val = keys[j][sorted_labels[j][i - 1]] + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) result[tup] = index[start:i] start = i cur = lab - tup = PyTuple_New(k) - for j in range(k): - val = keys[j][sorted_labels[j][n - 1]] - PyTuple_SET_ITEM(tup, j, val) - Py_INCREF(val) + if k == 1: + # When k = 1 we do not want to return a tuple as key + tup = keys[0][sorted_labels[0][n - 1]] + else: + tup = PyTuple_New(k) + for j in range(k): + val = keys[j][sorted_labels[j][n - 1]] + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) result[tup] = index[start:] return result diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b87e46f9b6648..4b7a47c5f93c2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,15 +1,10 @@ # Copyright (c) 2012, Lambda Foundry, Inc. # See LICENSE for the license -import bz2 from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC from errno import ENOENT -import gzip -import io -import os import sys import time import warnings -import zipfile from libc.stdlib cimport free from libc.string cimport strcasecmp, strlen, strncpy @@ -17,7 +12,7 @@ from libc.string cimport strcasecmp, strlen, strncpy import cython from cython import Py_ssize_t -from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString +from cpython.bytes cimport PyBytes_AsString from cpython.exc cimport PyErr_Fetch, PyErr_Occurred from cpython.object cimport PyObject from cpython.ref cimport Py_XDECREF @@ -67,7 +62,6 @@ from pandas._libs.khash cimport ( khiter_t, ) -from pandas.compat import get_lzma_file, import_lzma from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning from pandas.core.dtypes.common import ( @@ -82,11 +76,10 @@ from pandas.core.dtypes.common import ( ) from pandas.core.dtypes.concat import union_categoricals -lzma = import_lzma() - cdef: float64_t INF = np.inf float64_t NEGINF = -INF + int64_t DEFAULT_CHUNKSIZE = 256 * 1024 cdef extern from "headers/portable.h": @@ -275,14 +268,15 @@ cdef extern from "parser/io.h": size_t *bytes_read, int *status) -DEFAULT_CHUNKSIZE = 256 * 1024 - - cdef class TextReader: """ # source: StringIO or file object + ..versionchange:: 1.2.0 + removed 'compression', 'memory_map', and 'encoding' argument. + These arguments are outsourced to CParserWrapper. + 'source' has to be a file handle. """ cdef: @@ -299,7 +293,7 @@ cdef class TextReader: cdef public: int64_t leading_cols, table_width, skipfooter, buffer_lines - bint allow_leading_cols, mangle_dupe_cols, memory_map, low_memory + bint allow_leading_cols, mangle_dupe_cols, low_memory bint delim_whitespace object delimiter, converters object na_values @@ -307,8 +301,6 @@ cdef class TextReader: object index_col object skiprows object dtype - object encoding - object compression object usecols list dtype_cast_order set unnamed_cols @@ -321,10 +313,8 @@ cdef class TextReader: header_end=0, index_col=None, names=None, - bint memory_map=False, tokenize_chunksize=DEFAULT_CHUNKSIZE, bint delim_whitespace=False, - compression=None, converters=None, bint skipinitialspace=False, escapechar=None, @@ -332,7 +322,6 @@ cdef class TextReader: quotechar=b'"', quoting=0, lineterminator=None, - encoding=None, comment=None, decimal=b'.', thousands=None, @@ -356,15 +345,7 @@ cdef class TextReader: bint skip_blank_lines=True): # set encoding for native Python and C library - if encoding is not None: - if not isinstance(encoding, bytes): - encoding = encoding.encode('utf-8') - encoding = encoding.lower() - self.c_encoding = encoding - else: - self.c_encoding = NULL - - self.encoding = encoding + self.c_encoding = NULL self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -374,9 +355,6 @@ cdef class TextReader: # For timekeeping self.clocks = [] - self.compression = compression - self.memory_map = memory_map - self.parser.usecols = (usecols is not None) self._setup_parser_source(source) @@ -562,11 +540,6 @@ cdef class TextReader: parser_del(self.parser) def close(self): - # we need to properly close an open derived - # filehandle here, e.g. and UTFRecoder - if self.handle is not None: - self.handle.close() - # also preemptively free all allocated memory parser_free(self.parser) if self.true_set: @@ -614,82 +587,15 @@ cdef class TextReader: cdef: void *ptr - self.parser.cb_io = NULL - self.parser.cb_cleanup = NULL - - if self.compression: - if self.compression == 'gzip': - if isinstance(source, str): - source = gzip.GzipFile(source, 'rb') - else: - source = gzip.GzipFile(fileobj=source) - elif self.compression == 'bz2': - source = bz2.BZ2File(source, 'rb') - elif self.compression == 'zip': - zip_file = zipfile.ZipFile(source) - zip_names = zip_file.namelist() - - if len(zip_names) == 1: - file_name = zip_names.pop() - source = zip_file.open(file_name) - - elif len(zip_names) == 0: - raise ValueError(f'Zero files found in compressed ' - f'zip file {source}') - else: - raise ValueError(f'Multiple files found in compressed ' - f'zip file {zip_names}') - elif self.compression == 'xz': - if isinstance(source, str): - source = get_lzma_file(lzma)(source, 'rb') - else: - source = get_lzma_file(lzma)(filename=source) - else: - raise ValueError(f'Unrecognized compression type: ' - f'{self.compression}') - - if (self.encoding and hasattr(source, "read") and - not hasattr(source, "encoding")): - source = io.TextIOWrapper( - source, self.encoding.decode('utf-8'), newline='') - - self.encoding = b'utf-8' - self.c_encoding = self.encoding - - self.handle = source - - if isinstance(source, str): - encoding = sys.getfilesystemencoding() or "utf-8" - usource = source - source = source.encode(encoding) - - if self.memory_map: - ptr = new_mmap(source) - if ptr == NULL: - # fall back - ptr = new_file_source(source, self.parser.chunksize) - self.parser.cb_io = &buffer_file_bytes - self.parser.cb_cleanup = &del_file_source - else: - self.parser.cb_io = &buffer_mmap_bytes - self.parser.cb_cleanup = &del_mmap - else: - ptr = new_file_source(source, self.parser.chunksize) - self.parser.cb_io = &buffer_file_bytes - self.parser.cb_cleanup = &del_file_source - self.parser.source = ptr - - elif hasattr(source, 'read'): - # e.g., StringIO - - ptr = new_rd_source(source) - self.parser.source = ptr - self.parser.cb_io = &buffer_rd_bytes - self.parser.cb_cleanup = &del_rd_source - else: + if not hasattr(source, "read"): raise IOError(f'Expected file path name or file-like object, ' f'got {type(source)} type') + ptr = new_rd_source(source) + self.parser.source = ptr + self.parser.cb_io = &buffer_rd_bytes + self.parser.cb_cleanup = &del_rd_source + cdef _get_header(self): # header is now a list of lists, so field_count should use header[0] diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index b1f9ff71f5faa..b817d80c64ccd 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2438,7 +2438,7 @@ cpdef int freq_to_dtype_code(BaseOffset freq) except? -1: try: return freq._period_dtype_code except AttributeError as err: - raise ValueError(INVALID_FREQ_ERR_MSG) from err + raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) from err cdef int64_t _ordinal_from_fields(int year, int month, quarter, int day, diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d2690be905a68..bc4632ad028ab 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -12,7 +12,7 @@ from _thread import allocate_lock as _thread_allocate_lock import numpy as np import pytz -from numpy cimport int64_t +from numpy cimport int64_t, ndarray from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -51,7 +51,7 @@ cdef dict _parse_code_table = {'y': 0, 'u': 22} -def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise'): +def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='raise'): """ Calculates the datetime structs represented by the passed array of strings diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 45f32d92c7a74..29e8c58055f9e 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -227,7 +227,7 @@ cdef convert_to_timedelta64(object ts, str unit): @cython.boundscheck(False) @cython.wraparound(False) -def array_to_timedelta64(object[:] values, str unit=None, str errors="raise"): +def array_to_timedelta64(ndarray[object] values, str unit=None, str errors="raise"): """ Convert an ndarray to an array of timedeltas. If errors == 'coerce', coerce non-convertible objects to NaT. Otherwise, raise. diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index b2dbf7802e6f0..3556085bb300b 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1,14 +1,13 @@ # cython: boundscheck=False, wraparound=False, cdivision=True import cython -from cython import Py_ssize_t from libcpp.deque cimport deque import numpy as np cimport numpy as cnp -from numpy cimport float32_t, float64_t, int64_t, ndarray, uint8_t +from numpy cimport float32_t, float64_t, int64_t, ndarray cnp.import_array() @@ -1398,7 +1397,7 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights, # ---------------------------------------------------------------------- # Exponentially weighted moving average -def ewma_time(ndarray[float64_t] vals, int minp, ndarray[int64_t] times, +def ewma_time(const float64_t[:] vals, int minp, ndarray[int64_t] times, int64_t halflife): """ Compute exponentially-weighted moving average using halflife and time @@ -1416,30 +1415,40 @@ def ewma_time(ndarray[float64_t] vals, int minp, ndarray[int64_t] times, ndarray """ cdef: - Py_ssize_t i, num_not_nan = 0, N = len(vals) + Py_ssize_t i, j, num_not_nan = 0, N = len(vals) bint is_not_nan - float64_t last_result - ndarray[uint8_t] mask = np.zeros(N, dtype=np.uint8) - ndarray[float64_t] weights, observations, output = np.empty(N, dtype=np.float64) + float64_t last_result, weights_dot, weights_sum, weight, halflife_float + float64_t[:] times_float + float64_t[:] observations = np.zeros(N, dtype=float) + float64_t[:] times_masked = np.zeros(N, dtype=float) + ndarray[float64_t] output = np.empty(N, dtype=float) if N == 0: return output + halflife_float = halflife + times_float = times.astype(float) last_result = vals[0] - for i in range(N): - is_not_nan = vals[i] == vals[i] - num_not_nan += is_not_nan - if is_not_nan: - mask[i] = 1 - weights = 0.5 ** ((times[i] - times[mask.view(np.bool_)]) / halflife) - observations = vals[mask.view(np.bool_)] - last_result = np.sum(weights * observations) / np.sum(weights) - - if num_not_nan >= minp: - output[i] = last_result - else: - output[i] = NaN + with nogil: + for i in range(N): + is_not_nan = vals[i] == vals[i] + num_not_nan += is_not_nan + if is_not_nan: + times_masked[num_not_nan-1] = times_float[i] + observations[num_not_nan-1] = vals[i] + + weights_sum = 0 + weights_dot = 0 + for j in range(num_not_nan): + weight = 0.5 ** ( + (times_float[i] - times_masked[j]) / halflife_float) + weights_sum += weight + weights_dot += weight * observations[j] + + last_result = weights_dot / weights_sum + + output[i] = last_result if num_not_nan >= minp else NaN return output diff --git a/pandas/_testing.py b/pandas/_testing.py index 427585704ba58..ded2ed3141b47 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -736,8 +736,7 @@ def _get_ilevel_values(index, level): unique = index.levels[level] level_codes = index.codes[level] filled = take_1d(unique._values, level_codes, fill_value=unique._na_value) - values = unique._shallow_copy(filled, name=index.names[level]) - return values + return unique._shallow_copy(filled, name=index.names[level]) if check_less_precise is not no_default: warnings.warn( @@ -1885,8 +1884,7 @@ def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): def makePeriodIndex(k=10, name=None, **kwargs): dt = datetime(2000, 1, 1) - dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) - return dr + return pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) def makeMultiIndex(k=10, names=None, **kwargs): @@ -2525,9 +2523,12 @@ def network( @wraps(t) def wrapper(*args, **kwargs): - if check_before_test and not raise_on_error: - if not can_connect(url, error_classes): - skip() + if ( + check_before_test + and not raise_on_error + and not can_connect(url, error_classes) + ): + skip() try: return t(*args, **kwargs) except Exception as err: @@ -2942,8 +2943,7 @@ def convert_rows_list_to_csv_str(rows_list: List[str]): Expected output of to_csv() in current OS. """ sep = os.linesep - expected = sep.join(rows_list) + sep - return expected + return sep.join(rows_list) + sep def external_error_raised(expected_exception: Type[Exception]) -> ContextManager: diff --git a/pandas/_typing.py b/pandas/_typing.py index 3376559fb23ff..55a1c17b0aa53 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,6 +1,6 @@ -from dataclasses import dataclass from datetime import datetime, timedelta, tzinfo -from io import IOBase +from io import BufferedIOBase, RawIOBase, TextIOBase, TextIOWrapper +from mmap import mmap from pathlib import Path from typing import ( IO, @@ -10,12 +10,12 @@ Callable, Collection, Dict, - Generic, Hashable, List, Mapping, Optional, Sequence, + Tuple, Type, TypeVar, Union, @@ -77,8 +77,6 @@ "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] ] DtypeObj = Union[np.dtype, "ExtensionDtype"] -FilePathOrBuffer = Union[str, Path, IO[AnyStr], IOBase] -FileOrBuffer = Union[str, IO[AnyStr], IOBase] # FrameOrSeriesUnion means either a DataFrame or a Series. E.g. # `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series @@ -96,6 +94,7 @@ Label = Optional[Hashable] IndexLabel = Union[Label, Sequence[Label]] Level = Union[Label, int] +Shape = Tuple[int, ...] Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Axes = Collection @@ -133,6 +132,10 @@ "Resampler", ] +# filenames and file-like-objects +Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap] +FileOrBuffer = Union[str, Buffer[T]] +FilePathOrBuffer = Union[Path, FileOrBuffer[T]] # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] @@ -150,21 +153,3 @@ # type of float formatter in DataFrameFormatter FloatFormatType = Union[str, Callable, "EngFormatter"] - - -@dataclass -class IOargs(Generic[ModeVar, EncodingVar]): - """ - Return value of io/common.py:get_filepath_or_buffer. - - Note (copy&past from io/parsers): - filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] - though mypy handling of conditional imports is difficult. - See https://github.com/python/mypy/issues/1297 - """ - - filepath_or_buffer: FileOrBuffer - encoding: EncodingVar - compression: CompressionDict - should_close: bool - mode: Union[ModeVar, str] diff --git a/pandas/_version.py b/pandas/_version.py index b3fa8530d09eb..d2df063ff3acf 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -22,8 +22,7 @@ def get_keywords(): # get_keywords(). git_refnames = "$Format:%d$" git_full = "$Format:%H$" - keywords = {"refnames": git_refnames, "full": git_full} - return keywords + return {"refnames": git_refnames, "full": git_full} class VersioneerConfig: @@ -121,17 +120,16 @@ def git_get_keywords(versionfile_abs): # _version.py. keywords = {} try: - f = open(versionfile_abs) - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - f.close() + with open(versionfile_abs) as fd: + for line in fd.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) except OSError: pass return keywords @@ -286,13 +284,11 @@ def render_pep440(pieces): if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += f"{pieces['distance']:d}.g{pieces['short']}" - if pieces["dirty"]: - rendered += ".dirty" else: # exception #1 rendered = f"0+untagged.{pieces['distance']:d}.g{pieces['short']}" - if pieces["dirty"]: - rendered += ".dirty" + if pieces["dirty"]: + rendered += ".dirty" return rendered @@ -348,13 +344,11 @@ def render_pep440_old(pieces): rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += f".post{pieces['distance']:d}" - if pieces["dirty"]: - rendered += ".dev0" else: # exception #1 rendered = f"0.post{pieces['distance']:d}" - if pieces["dirty"]: - rendered += ".dev0" + if pieces["dirty"]: + rendered += ".dev0" return rendered diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 57e378758cc78..2ac9b9e2c875c 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -50,7 +50,7 @@ def is_platform_windows() -> bool: bool True if the running platform is windows. """ - return sys.platform == "win32" or sys.platform == "cygwin" + return sys.platform in ["win32", "cygwin"] def is_platform_linux() -> bool: diff --git a/pandas/conftest.py b/pandas/conftest.py index 515d20e8c5781..b2daa2c5bc3f7 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -290,6 +290,16 @@ def unique_nulls_fixture(request): # ---------------------------------------------------------------- # Classes # ---------------------------------------------------------------- + + +@pytest.fixture(params=[pd.DataFrame, pd.Series]) +def frame_or_series(request): + """ + Fixture to parametrize over DataFrame and Series. + """ + return request.param + + @pytest.fixture( params=[pd.Index, pd.Series], ids=["index", "series"] # type: ignore[list-item] ) @@ -386,13 +396,12 @@ def _create_multiindex(): major_codes = np.array([0, 0, 1, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) index_names = ["first", "second"] - mi = MultiIndex( + return MultiIndex( levels=[major_axis, minor_axis], codes=[major_codes, minor_codes], names=index_names, verify_integrity=False, ) - return mi def _create_mi_with_dt64tz_level(): diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e9e04ace784b6..ec88eb817b3f8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2061,27 +2061,25 @@ def safe_sort( dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) - def sort_mixed(values): - # order ints before strings, safe in py3 - str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) - nums = np.sort(values[~str_pos]) - strs = np.sort(values[str_pos]) - return np.concatenate([nums, np.asarray(strs, dtype=object)]) - sorter = None + if ( not is_extension_array_dtype(values) and lib.infer_dtype(values, skipna=False) == "mixed-integer" ): - # unorderable in py3 if mixed str/int - ordered = sort_mixed(values) + ordered = _sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: - # try this anyway - ordered = sort_mixed(values) + # Previous sorters failed or were not applicable, try `_sort_mixed` + # which would work, but which fails for special case of 1d arrays + # with tuples. + if values.size and isinstance(values[0], tuple): + ordered = _sort_tuples(values) + else: + ordered = _sort_mixed(values) # codes: @@ -2128,3 +2126,26 @@ def sort_mixed(values): np.putmask(new_codes, mask, na_sentinel) return ordered, ensure_platform_int(new_codes) + + +def _sort_mixed(values): + """ order ints before strings in 1d arrays, safe in py3 """ + str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return np.concatenate([nums, np.asarray(strs, dtype=object)]) + + +def _sort_tuples(values: np.ndarray[tuple]): + """ + Convert array of tuples (1d) to array or array (2d). + We need to keep the columns separately as they contain different types and + nans (can't use `np.sort` as it may fail when str and nan are mixed in a + column as types cannot be compared). + """ + from pandas.core.internals.construction import to_arrays + from pandas.core.sorting import lexsort_indexer + + arrays, _ = to_arrays(values, None) + indexer = lexsort_indexer(arrays, orders=True) + return values[indexer] diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 002e260742dc5..a14debce6eea7 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -362,8 +362,10 @@ def wrap_results_for_axis( isinstance(x, dict) for x in results.values() ): # Our operation was a to_dict op e.g. - # test_apply_dict GH#8735, test_apply_reduce_rows_to_dict GH#25196 - return self.obj._constructor_sliced(results) + # test_apply_dict GH#8735, test_apply_reduce_to_dict GH#25196 #37544 + res = self.obj._constructor_sliced(results) + res.index = res_index + return res try: result = self.obj._constructor(data=results) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index aab5c5a110db8..67ac2a3688214 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -1,8 +1,9 @@ -from typing import Any, Sequence, Tuple, TypeVar +from typing import Any, Sequence, TypeVar import numpy as np from pandas._libs import lib +from pandas._typing import Shape from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc @@ -93,7 +94,7 @@ def _validate_fill_value(self, fill_value): # TODO: make this a cache_readonly; for that to work we need to remove # the _index_data kludge in libreduction @property - def shape(self) -> Tuple[int, ...]: + def shape(self) -> Shape: return self._ndarray.shape def __len__(self) -> int: @@ -252,3 +253,24 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): else: msg = f"'{type(self).__name__}' does not implement reduction '{name}'" raise TypeError(msg) + + # ------------------------------------------------------------------------ + + def __repr__(self) -> str: + if self.ndim == 1: + return super().__repr__() + + from pandas.io.formats.printing import format_object_summary + + # the short repr has no trailing newline, while the truncated + # repr does. So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + lines = [ + format_object_summary(x, self._formatter(), indent_for_name=False).rstrip( + ", \n" + ) + for x in self + ] + data = ",\n".join(lines) + class_name = f"<{type(self).__name__}>" + return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 57f8f11d4d04c..be105fd1f2a46 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -12,7 +12,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, Shape from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -403,7 +403,7 @@ def dtype(self) -> ExtensionDtype: raise AbstractMethodError(self) @property - def shape(self) -> Tuple[int, ...]: + def shape(self) -> Shape: """ Return a tuple of the array dimensions. """ @@ -460,7 +460,7 @@ def astype(self, dtype, copy=True): if is_dtype_equal(dtype, self.dtype): if not copy: return self - elif copy: + else: return self.copy() if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -544,14 +544,13 @@ def argsort( ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) values = self._values_for_argsort() - result = nargsort( + return nargsort( values, kind=kind, ascending=ascending, na_position=na_position, mask=np.asarray(self.isna()), ) - return result def argmin(self): """ @@ -780,12 +779,12 @@ def equals(self, other: object) -> bool: boolean Whether the arrays are equivalent. """ - if not type(self) == type(other): + if type(self) != type(other): return False other = cast(ExtensionArray, other) if not is_dtype_equal(self.dtype, other.dtype): return False - elif not len(self) == len(other): + elif len(self) != len(other): return False else: equal_values = self == other diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 73aa97c832848..21306455573b8 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -170,12 +170,13 @@ def coerce_to_array( values[~mask_values] = values_object[~mask_values].astype(bool) # if the values were integer-like, validate it were actually 0/1's - if inferred_dtype in integer_like: - if not np.all( + if (inferred_dtype in integer_like) and not ( + np.all( values[~mask_values].astype(float) == values_object[~mask_values].astype(float) - ): - raise TypeError("Need to pass bool-like values") + ) + ): + raise TypeError("Need to pass bool-like values") if mask is None and mask_values is None: mask = np.zeros(len(values), dtype=bool) @@ -193,9 +194,9 @@ def coerce_to_array( if mask_values is not None: mask = mask | mask_values - if not values.ndim == 1: + if values.ndim != 1: raise ValueError("values must be a 1D list-like") - if not mask.ndim == 1: + if mask.ndim != 1: raise ValueError("mask must be a 1D list-like") return values, mask @@ -395,9 +396,8 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False ) # for integer, error if there are missing values - if is_integer_dtype(dtype): - if self._hasna: - raise ValueError("cannot convert NA to integer") + if is_integer_dtype(dtype) and self._hasna: + raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) na_value = self._na_value @@ -576,7 +576,7 @@ def _logical_method(self, other, op): elif isinstance(other, np.bool_): other = other.item() - if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): + if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other): raise TypeError( "'other' should be pandas.NA or a bool. " f"Got {type(other).__name__} instead." diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 03f66ff82ad75..626fb495dec03 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -78,7 +78,7 @@ def func(self, other): # the same (maybe up to ordering, depending on ordered) msg = "Categoricals can only be compared if 'categories' are the same." - if not self.is_dtype_equal(other): + if not self._categories_match_up_to_permutation(other): raise TypeError(msg) if not self.ordered and not self.categories.equals(other.categories): @@ -1177,11 +1177,6 @@ def map(self, mapper): # ------------------------------------------------------------- # Validators; ideally these can be de-duplicated - def _validate_where_value(self, value): - if is_scalar(value): - return self._validate_fill_value(value) - return self._validate_listlike(value) - def _validate_insert_value(self, value) -> int: return self._validate_fill_value(value) @@ -1319,8 +1314,7 @@ def isna(self): Categorical.notna : Boolean inverse of Categorical.isna. """ - ret = self._codes == -1 - return ret + return self._codes == -1 isnull = isna @@ -1368,7 +1362,7 @@ def value_counts(self, dropna=True): from pandas import CategoricalIndex, Series code, cat = self._codes, self.categories - ncat, mask = len(cat), 0 <= code + ncat, mask = (len(cat), code >= 0) ix, clean = np.arange(ncat), mask.all() if dropna or clean: @@ -1660,21 +1654,15 @@ def fillna(self, value=None, method=None, limit=None): codes = self._ndarray.copy() mask = self.isna() + new_codes = self._validate_setitem_value(value) + if isinstance(value, (np.ndarray, Categorical)): # We get ndarray or Categorical if called via Series.fillna, # where it will unwrap another aligned Series before getting here - - not_categories = ~algorithms.isin(value, self.categories) - if not isna(value[not_categories]).all(): - # All entries in `value` must either be a category or NA - raise ValueError("fill value must be in categories") - - values_codes = _get_codes_for_values(value, self.categories) - codes[mask] = values_codes[mask] + codes[mask] = new_codes[mask] else: - new_code = self._validate_fill_value(value) - codes[mask] = new_code + codes[mask] = new_codes return self._from_backing_data(codes) @@ -1874,11 +1862,12 @@ def _validate_setitem_value(self, value): # require identical categories set if isinstance(value, Categorical): - if not is_dtype_equal(self, value): + if not is_dtype_equal(self.dtype, value.dtype): raise ValueError( "Cannot set a Categorical with another, " "without identical categories" ) + # is_dtype_equal implies categories_match_up_to_permutation new_codes = self._validate_listlike(value) value = Categorical.from_codes(new_codes, dtype=self.dtype) @@ -1930,8 +1919,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: ) counts = counts.cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) - result = dict(zip(categories, _result)) - return result + return dict(zip(categories, _result)) # ------------------------------------------------------------------ # Reductions @@ -2112,7 +2100,7 @@ def equals(self, other: object) -> bool: """ if not isinstance(other, Categorical): return False - elif self.is_dtype_equal(other): + elif self._categories_match_up_to_permutation(other): other_codes = self._validate_listlike(other) return np.array_equal(self._codes, other_codes) return False @@ -2125,7 +2113,7 @@ def _concat_same_type(self, to_concat): # ------------------------------------------------------------------ - def is_dtype_equal(self, other): + def _categories_match_up_to_permutation(self, other: "Categorical") -> bool: """ Returns True if categoricals are the same dtype same categories, and same ordered @@ -2138,8 +2126,17 @@ def is_dtype_equal(self, other): ------- bool """ + return hash(self.dtype) == hash(other.dtype) + + def is_dtype_equal(self, other) -> bool: + warn( + "Categorical.is_dtype_equal is deprecated and will be removed " + "in a future version", + FutureWarning, + stacklevel=2, + ) try: - return hash(self.dtype) == hash(other.dtype) + return self._categories_match_up_to_permutation(other) except (AttributeError, TypeError): return False diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f8a609fb0cabe..404511895ddf0 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -151,7 +151,9 @@ def _rebox_native(cls, value: int) -> Union[int, np.datetime64, np.timedelta64]: """ raise AbstractMethodError(cls) - def _unbox_scalar(self, value: DTScalarOrNaT, setitem: bool = False) -> int: + def _unbox_scalar( + self, value: DTScalarOrNaT, setitem: bool = False + ) -> Union[np.int64, np.datetime64, np.timedelta64]: """ Unbox the integer value of a scalar `value`. @@ -623,8 +625,6 @@ def _validate_setitem_value(self, value): return self._unbox(value, setitem=True) - _validate_where_value = _validate_setitem_value - def _validate_insert_value(self, value): value = self._validate_scalar(value) @@ -638,7 +638,6 @@ def _unbox( """ if lib.is_scalar(other): other = self._unbox_scalar(other, setitem=setitem) - other = self._rebox_native(other) else: # same type as self self._check_compatible_with(other, setitem=setitem) @@ -1026,9 +1025,8 @@ def _addsub_object_array(self, other: np.ndarray, op): result : same class as self """ assert op in [operator.add, operator.sub] - if len(other) == 1: + if len(other) == 1 and self.ndim == 1: # If both 1D then broadcasting is unambiguous - # TODO(EA2D): require self.ndim == other.ndim here return op(self, other[0]) warnings.warn( @@ -1064,8 +1062,7 @@ def _time_shift(self, periods, freq=None): if isinstance(freq, str): freq = to_offset(freq) offset = periods * freq - result = self + offset - return result + return self + offset if periods == 0 or len(self) == 0: # GH#14811 empty case @@ -1535,10 +1532,9 @@ def _round(self, freq, mode, ambiguous, nonexistent): self = cast("DatetimeArray", self) naive = self.tz_localize(None) result = naive._round(freq, mode, ambiguous, nonexistent) - aware = result.tz_localize( + return result.tz_localize( self.tz, ambiguous=ambiguous, nonexistent=nonexistent ) - return aware values = self.view("i8") result = round_nsint64(values, mode, freq) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index a1050f4271e05..905242bfdd8ad 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -454,16 +454,13 @@ def _generate_range( # ----------------------------------------------------------------- # DatetimeLike Interface - @classmethod - def _rebox_native(cls, value: int) -> np.datetime64: - return np.int64(value).view("M8[ns]") - - def _unbox_scalar(self, value, setitem: bool = False): + def _unbox_scalar(self, value, setitem: bool = False) -> np.datetime64: if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timestamp.") if not isna(value): self._check_compatible_with(value, setitem=setitem) - return value.value + return value.asm8 + return np.datetime64(value.value, "ns") def _scalar_from_string(self, value): return Timestamp(value, tz=self.tz) @@ -566,7 +563,8 @@ def __iter__(self): tstamp : Timestamp """ if self.ndim > 1: - return (self[n] for n in range(len(self))) + for i in range(len(self)): + yield self[i] else: # convert in chunks of 10k for efficiency data = self.asi8 @@ -1970,7 +1968,13 @@ def sequence_to_dt64ns( data, inferred_tz = objects_to_datetime64ns( data, dayfirst=dayfirst, yearfirst=yearfirst ) - tz = _maybe_infer_tz(tz, inferred_tz) + if tz and inferred_tz: + # two timezones: convert to intended from base UTC repr + data = tzconversion.tz_convert_from_utc(data.view("i8"), tz) + data = data.view(DT64NS_DTYPE) + elif inferred_tz: + tz = inferred_tz + data_dtype = data.dtype # `data` may have originally been a Categorical[datetime64[ns, tz]], diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 161cf3bf3a677..f8ece2a9fe7d4 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1,3 +1,4 @@ +import operator from operator import le, lt import textwrap from typing import TYPE_CHECKING, Optional, Tuple, Union, cast @@ -12,6 +13,7 @@ IntervalMixin, intervals_to_interval_bounds, ) +from pandas._libs.missing import NA from pandas._typing import ArrayLike, Dtype from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender @@ -48,7 +50,7 @@ from pandas.core.construction import array, extract_array from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index -from pandas.core.ops import unpack_zerodim_and_defer +from pandas.core.ops import invalid_comparison, unpack_zerodim_and_defer if TYPE_CHECKING: from pandas import Index @@ -520,8 +522,7 @@ def __setitem__(self, key, value): self._left[key] = value_left self._right[key] = value_right - @unpack_zerodim_and_defer("__eq__") - def __eq__(self, other): + def _cmp_method(self, other, op): # ensure pandas array for list-like and eliminate non-interval scalars if is_list_like(other): if len(self) != len(other): @@ -529,7 +530,7 @@ def __eq__(self, other): other = array(other) elif not isinstance(other, Interval): # non-interval scalar -> no matches - return np.zeros(len(self), dtype=bool) + return invalid_comparison(self, other, op) # determine the dtype of the elements we want to compare if isinstance(other, Interval): @@ -543,7 +544,8 @@ def __eq__(self, other): # extract intervals if we have interval categories with matching closed if is_interval_dtype(other_dtype): if self.closed != other.categories.closed: - return np.zeros(len(self), dtype=bool) + return invalid_comparison(self, other, op) + other = other.categories.take( other.codes, allow_fill=True, fill_value=other.categories._na_value ) @@ -551,27 +553,70 @@ def __eq__(self, other): # interval-like -> need same closed and matching endpoints if is_interval_dtype(other_dtype): if self.closed != other.closed: - return np.zeros(len(self), dtype=bool) - return (self._left == other.left) & (self._right == other.right) + return invalid_comparison(self, other, op) + elif not isinstance(other, Interval): + other = type(self)(other) + + if op is operator.eq: + return (self._left == other.left) & (self._right == other.right) + elif op is operator.ne: + return (self._left != other.left) | (self._right != other.right) + elif op is operator.gt: + return (self._left > other.left) | ( + (self._left == other.left) & (self._right > other.right) + ) + elif op is operator.ge: + return (self == other) | (self > other) + elif op is operator.lt: + return (self._left < other.left) | ( + (self._left == other.left) & (self._right < other.right) + ) + else: + # operator.lt + return (self == other) | (self < other) # non-interval/non-object dtype -> no matches if not is_object_dtype(other_dtype): - return np.zeros(len(self), dtype=bool) + return invalid_comparison(self, other, op) # object dtype -> iteratively check for intervals result = np.zeros(len(self), dtype=bool) for i, obj in enumerate(other): - # need object to be an Interval with same closed and endpoints - if ( - isinstance(obj, Interval) - and self.closed == obj.closed - and self._left[i] == obj.left - and self._right[i] == obj.right - ): - result[i] = True - + try: + result[i] = op(self[i], obj) + except TypeError: + if obj is NA: + # comparison with np.nan returns NA + # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092 + result[i] = op is operator.ne + else: + raise return result + @unpack_zerodim_and_defer("__eq__") + def __eq__(self, other): + return self._cmp_method(other, operator.eq) + + @unpack_zerodim_and_defer("__ne__") + def __ne__(self, other): + return self._cmp_method(other, operator.ne) + + @unpack_zerodim_and_defer("__gt__") + def __gt__(self, other): + return self._cmp_method(other, operator.gt) + + @unpack_zerodim_and_defer("__ge__") + def __ge__(self, other): + return self._cmp_method(other, operator.ge) + + @unpack_zerodim_and_defer("__lt__") + def __lt__(self, other): + return self._cmp_method(other, operator.lt) + + @unpack_zerodim_and_defer("__le__") + def __le__(self, other): + return self._cmp_method(other, operator.le) + def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9febba0f544ac..b633f268049e5 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -84,9 +84,9 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): "mask should be boolean numpy array. Use " "the 'pd.array' function instead" ) - if not values.ndim == 1: + if values.ndim != 1: raise ValueError("values must be a 1D array") - if not mask.ndim == 1: + if mask.ndim != 1: raise ValueError("mask must be a 1D array") if copy: @@ -209,7 +209,8 @@ def to_numpy( dtype = object if self._hasna: if ( - not (is_object_dtype(dtype) or is_string_dtype(dtype)) + not is_object_dtype(dtype) + and not is_string_dtype(dtype) and na_value is libmissing.NA ): raise ValueError( diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index cd48f6cbc8170..e1a424b719a4a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -281,17 +281,15 @@ def all(self, *, axis=None, out=None, keepdims=False, skipna=True): def min(self, *, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) - result = masked_reductions.min( + return masked_reductions.min( values=self.to_numpy(), mask=self.isna(), skipna=skipna ) - return result def max(self, *, skipna: bool = True, **kwargs) -> Scalar: nv.validate_max((), kwargs) - result = masked_reductions.max( + return masked_reductions.max( values=self.to_numpy(), mask=self.isna(), skipna=skipna ) - return result def sum(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: nv.validate_sum((), kwargs) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index b95a7acc19b1f..8de84a0187e95 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -260,18 +260,14 @@ def _generate_range(cls, start, end, periods, freq, fields): # ----------------------------------------------------------------- # DatetimeLike Interface - @classmethod - def _rebox_native(cls, value: int) -> np.int64: - return np.int64(value) - def _unbox_scalar( self, value: Union[Period, NaTType], setitem: bool = False ) -> int: if value is NaT: - return value.value + return np.int64(value.value) elif isinstance(value, self._scalar_type): self._check_compatible_with(value, setitem=setitem) - return value.ordinal + return np.int64(value.ordinal) else: raise ValueError(f"'value' should be a Period. Got '{value}' instead.") @@ -593,7 +589,7 @@ def astype(self, dtype, copy: bool = True): if is_dtype_equal(dtype, self._dtype): if not copy: return self - elif copy: + else: return self.copy() if is_period_dtype(dtype): return self.asfreq(dtype.freq) @@ -1084,11 +1080,9 @@ def _make_field_arrays(*fields): elif length is None: length = len(x) - arrays = [ + return [ np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries)) else np.repeat(x, length) for x in fields ] - - return arrays diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4346e02069667..9152ce72d75aa 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -22,6 +22,7 @@ construct_1d_arraylike_from_scalar, find_common_type, infer_dtype_from_scalar, + maybe_box_datetimelike, ) from pandas.core.dtypes.common import ( is_array_like, @@ -316,9 +317,8 @@ def __init__( raise Exception("must only pass scalars with an index") if is_scalar(data): - if index is not None: - if data is None: - data = np.nan + if index is not None and data is None: + data = np.nan if index is not None: npoints = len(index) @@ -575,8 +575,7 @@ def density(self): >>> s.density 0.6 """ - r = float(self.sp_index.npoints) / float(self.sp_index.length) - return r + return float(self.sp_index.npoints) / float(self.sp_index.length) @property def npoints(self) -> int: @@ -736,25 +735,17 @@ def value_counts(self, dropna=True): keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps - if fcounts > 0: - if self._null_fill_value and dropna: - pass + if fcounts > 0 and (not self._null_fill_value or not dropna): + mask = isna(keys) if self._null_fill_value else keys == self.fill_value + if mask.any(): + counts[mask] += fcounts else: - if self._null_fill_value: - mask = isna(keys) - else: - mask = keys == self.fill_value - - if mask.any(): - counts[mask] += fcounts - else: - keys = np.insert(keys, 0, self.fill_value) - counts = np.insert(counts, 0, fcounts) + keys = np.insert(keys, 0, self.fill_value) + counts = np.insert(counts, 0, fcounts) if not isinstance(keys, ABCIndexClass): keys = Index(keys) - result = Series(counts, index=keys) - return result + return Series(counts, index=keys) # -------- # Indexing @@ -815,7 +806,7 @@ def _get_val_at(self, loc): return self.fill_value else: val = self.sp_values[sp_loc] - val = com.maybe_box_datetimelike(val, self.sp_values.dtype) + val = maybe_box_datetimelike(val, self.sp_values.dtype) return val def take(self, indices, allow_fill=False, fill_value=None) -> "SparseArray": @@ -1062,7 +1053,7 @@ def astype(self, dtype=None, copy=True): if is_dtype_equal(dtype, self._dtype): if not copy: return self - elif copy: + else: return self.copy() dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 0d9d257810674..8a87df18b6adb 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -227,8 +227,7 @@ def _from_sequence( data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None) freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False) - result = cls._simple_new(data, freq=freq) - return result + return cls._simple_new(data, freq=freq) @classmethod def _from_sequence_not_strict( @@ -301,15 +300,11 @@ def _generate_range(cls, start, end, periods, freq, closed=None): # ---------------------------------------------------------------- # DatetimeLike Interface - @classmethod - def _rebox_native(cls, value: int) -> np.timedelta64: - return np.int64(value).view("m8[ns]") - - def _unbox_scalar(self, value, setitem: bool = False): + def _unbox_scalar(self, value, setitem: bool = False) -> np.timedelta64: if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timedelta.") self._check_compatible_with(value, setitem=setitem) - return value.value + return np.timedelta64(value.value, "ns") def _scalar_from_string(self, value): return Timedelta(value) @@ -338,10 +333,9 @@ def astype(self, dtype, copy: bool = True): if self._hasnans: # avoid double-copying result = self._data.astype(dtype, copy=False) - values = self._maybe_mask_results( + return self._maybe_mask_results( result, fill_value=None, convert="float64" ) - return values result = self._data.astype(dtype, copy=copy) return result.astype("i8") elif is_timedelta64_ns_dtype(dtype): @@ -352,7 +346,8 @@ def astype(self, dtype, copy: bool = True): def __iter__(self): if self.ndim > 1: - return (self[n] for n in range(len(self))) + for i in range(len(self)): + yield self[i] else: # convert in chunks of 10k for efficiency data = self.asi8 diff --git a/pandas/core/common.py b/pandas/core/common.py index b860c83f89cbc..9b6133d2f7627 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -6,7 +6,6 @@ from collections import abc, defaultdict import contextlib -from datetime import datetime, timedelta from functools import partial import inspect from typing import Any, Collection, Iterable, Iterator, List, Union, cast @@ -14,7 +13,7 @@ import numpy as np -from pandas._libs import lib, tslibs +from pandas._libs import lib from pandas._typing import AnyArrayLike, Scalar, T from pandas.compat.numpy import np_version_under1p18 @@ -78,21 +77,6 @@ def consensus_name_attr(objs): return name -def maybe_box_datetimelike(value, dtype=None): - # turn a datetime like into a Timestamp/timedelta as needed - if dtype == object: - # If we dont have datetime64/timedelta64 dtype, we dont want to - # box datetimelike scalars - return value - - if isinstance(value, (np.datetime64, datetime)): - value = tslibs.Timestamp(value) - elif isinstance(value, (np.timedelta64, timedelta)): - value = tslibs.Timedelta(value) - - return value - - def is_bool_indexer(key: Any) -> bool: """ Check whether `key` is a valid boolean indexer. @@ -347,23 +331,6 @@ def apply_if_callable(maybe_callable, obj, **kwargs): return maybe_callable -def dict_compat(d): - """ - Helper function to convert datetimelike-keyed dicts - to Timestamp-keyed dict. - - Parameters - ---------- - d: dict like object - - Returns - ------- - dict - - """ - return {maybe_box_datetimelike(key): value for key, value in d.items()} - - def standardize_mapping(into): """ Helper function to standardize a supplied mapping. diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 82867cf9dcd29..8a8b0d564ea49 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -38,8 +38,7 @@ def _align_core_single_unary_op( def _zip_axes_from_type( typ: Type[FrameOrSeries], new_axes: Sequence[int] ) -> Dict[str, int]: - axes = {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} - return axes + return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} def _any_pandas_objects(terms) -> bool: @@ -186,8 +185,11 @@ def reconstruct_object(typ, obj, axes, dtype): # The condition is to distinguish 0-dim array (returned in case of # scalar) and 1 element array # e.g. np.array(0) and np.array([0]) - if len(obj.shape) == 1 and len(obj) == 1: - if not isinstance(ret_value, np.ndarray): - ret_value = np.array([ret_value]).astype(res_t) + if ( + len(obj.shape) == 1 + and len(obj) == 1 + and not isinstance(ret_value, np.ndarray) + ): + ret_value = np.array([ret_value]).astype(res_t) return ret_value diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index b77204861f0a4..12f16343362e2 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -52,12 +52,11 @@ def _check_engine(engine: Optional[str]) -> str: # TODO: validate this in a more general way (thinking of future engines # that won't necessarily be import-able) # Could potentially be done on engine instantiation - if engine == "numexpr": - if not NUMEXPR_INSTALLED: - raise ImportError( - "'numexpr' is not installed or an unsupported version. Cannot use " - "engine='numexpr' for query/eval if 'numexpr' is not installed" - ) + if engine == "numexpr" and not NUMEXPR_INSTALLED: + raise ImportError( + "'numexpr' is not installed or an unsupported version. Cannot use " + "engine='numexpr' for query/eval if 'numexpr' is not installed" + ) return engine diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 8c56f02c8d3cc..c971551a7f400 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -496,15 +496,14 @@ def _maybe_evaluate_binop( f"'{lhs.type}' and '{rhs.type}'" ) - if self.engine != "pytables": - if ( - res.op in CMP_OPS_SYMS - and getattr(lhs, "is_datetime", False) - or getattr(rhs, "is_datetime", False) - ): - # all date ops must be done in python bc numexpr doesn't work - # well with NaT - return self._maybe_eval(res, self.binary_ops) + if self.engine != "pytables" and ( + res.op in CMP_OPS_SYMS + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT + return self._maybe_eval(res, self.binary_ops) if res.op in eval_in_python: # "in"/"not in" ops are always evaluated in python diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index dd622ed724e8f..6ec637a8b4845 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -378,14 +378,14 @@ def prune(self, klass): operand = self.operand operand = operand.prune(klass) - if operand is not None: - if issubclass(klass, ConditionBinOp): - if operand.condition is not None: - return operand.invert() - elif issubclass(klass, FilterBinOp): - if operand.filter is not None: - return operand.invert() - + if operand is not None and ( + issubclass(klass, ConditionBinOp) + and operand.condition is not None + or not issubclass(klass, ConditionBinOp) + and issubclass(klass, FilterBinOp) + and operand.filter is not None + ): + return operand.invert() return None diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 2925f583bfc56..7a9b8caa985e3 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -144,8 +144,7 @@ def __init__( def __repr__(self) -> str: scope_keys = _get_pretty_string(list(self.scope.keys())) res_keys = _get_pretty_string(list(self.resolvers.keys())) - unicode_str = f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})" - return unicode_str + return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})" @property def has_resolvers(self) -> bool: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 692da8f8e021e..9758eae60c262 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -7,6 +7,7 @@ from typing import ( TYPE_CHECKING, Any, + Dict, List, Optional, Sequence, @@ -19,7 +20,7 @@ import numpy as np -from pandas._libs import lib, tslib +from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import ( NaT, OutOfBoundsDatetime, @@ -32,7 +33,7 @@ ints_to_pytimedelta, ) from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar +from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar, Shape from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( @@ -134,6 +135,30 @@ def is_nested_object(obj) -> bool: return False +def maybe_box_datetimelike(value: Scalar, dtype: Optional[Dtype] = None) -> Scalar: + """ + Cast scalar to Timestamp or Timedelta if scalar is datetime-like + and dtype is not object. + + Parameters + ---------- + value : scalar + dtype : Dtype, optional + + Returns + ------- + scalar + """ + if dtype == object: + pass + elif isinstance(value, (np.datetime64, datetime)): + value = tslibs.Timestamp(value) + elif isinstance(value, (np.timedelta64, timedelta)): + value = tslibs.Timedelta(value) + + return value + + def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]): """ try to cast to the specified dtype (e.g. convert back to bool/int @@ -791,6 +816,22 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, return dtype, val +def dict_compat(d: Dict[Scalar, Scalar]) -> Dict[Scalar, Scalar]: + """ + Convert datetimelike-keyed dicts to a Timestamp-keyed dict. + + Parameters + ---------- + d: dict-like object + + Returns + ------- + dict + + """ + return {maybe_box_datetimelike(key): value for key, value in d.items()} + + def infer_dtype_from_array( arr, pandas_dtype: bool = False ) -> Tuple[DtypeObj, ArrayLike]: @@ -1591,7 +1632,7 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: def cast_scalar_to_array( - shape: Tuple, value: Scalar, dtype: Optional[DtypeObj] = None + shape: Shape, value: Scalar, dtype: Optional[DtypeObj] = None ) -> np.ndarray: """ Create np.ndarray of specified shape and dtype, filled with values. diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 60fd959701821..99dc01ef421d1 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -296,7 +296,7 @@ def _maybe_unwrap(x): raise TypeError("dtype of categories must be the same") ordered = False - if all(first.is_dtype_equal(other) for other in to_union[1:]): + if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5134529d9c21f..049d2c4888a69 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15,6 +15,7 @@ import datetime from io import StringIO import itertools +import mmap from textwrap import dedent from typing import ( IO, @@ -83,6 +84,7 @@ find_common_type, infer_dtype_from_scalar, invalidate_string_dtypes, + maybe_box_datetimelike, maybe_cast_to_datetime, maybe_casted_values, maybe_convert_platform, @@ -131,7 +133,13 @@ from pandas.core.construction import extract_array from pandas.core.generic import NDFrame, _shared_docs from pandas.core.indexes import base as ibase -from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + PeriodIndex, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes.multi import MultiIndex, maybe_droplevels from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable from pandas.core.internals import BlockManager @@ -353,7 +361,7 @@ class DataFrame(NDFrame, OpsMixin): Parameters ---------- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame - Dict can contain Series, arrays, constants, or list-like objects. If + Dict can contain Series, arrays, constants, dataclass or list-like objects. If data is a dict, column order follows insertion-order. .. versionchanged:: 0.25.0 @@ -413,6 +421,16 @@ class DataFrame(NDFrame, OpsMixin): 0 1 2 3 1 4 5 6 2 7 8 9 + + Constructing DataFrame from dataclass: + + >>> from dataclasses import make_dataclass + >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) + >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) + x y + 0 0 0 + 1 0 3 + 2 2 3 """ _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set @@ -1521,7 +1539,7 @@ def to_dict(self, orient="dict", into=dict): ( "data", [ - list(map(com.maybe_box_datetimelike, t)) + list(map(maybe_box_datetimelike, t)) for t in self.itertuples(index=False, name=None) ], ), @@ -1529,7 +1547,7 @@ def to_dict(self, orient="dict", into=dict): ) elif orient == "series": - return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items()) + return into_c((k, maybe_box_datetimelike(v)) for k, v in self.items()) elif orient == "records": columns = self.columns.tolist() @@ -1538,7 +1556,7 @@ def to_dict(self, orient="dict", into=dict): for row in self.itertuples(index=False, name=None) ) return [ - into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items()) + into_c((k, maybe_box_datetimelike(v)) for k, v in row.items()) for row in rows ] @@ -2280,10 +2298,9 @@ def to_markdown( if buf is None: return result ioargs = get_filepath_or_buffer(buf, mode=mode, storage_options=storage_options) - assert not isinstance(ioargs.filepath_or_buffer, str) + assert not isinstance(ioargs.filepath_or_buffer, (str, mmap.mmap)) ioargs.filepath_or_buffer.writelines(result) - if ioargs.should_close: - ioargs.filepath_or_buffer.close() + ioargs.close() return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") @@ -6432,7 +6449,7 @@ def update( See Also -------- dict.update : Similar method for dictionaries. - DataFrame.merge : For column(s)-on-columns(s) operations. + DataFrame.merge : For column(s)-on-column(s) operations. Examples -------- @@ -7395,7 +7412,7 @@ def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: return self - self.shift(periods, axis=axis) new_data = self._mgr.diff(n=periods, axis=bm_axis) - return self._constructor(new_data) + return self._constructor(new_data).__finalize__(self, "diff") # ---------------------------------------------------------------------- # Function application @@ -7774,7 +7791,7 @@ def infer(x): return lib.map_infer(x, func, ignore_na=ignore_na) return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) - return self.apply(infer) + return self.apply(infer).__finalize__(self, "applymap") # ---------------------------------------------------------------------- # Merging / joining methods @@ -7911,12 +7928,14 @@ def append( to_concat = [self, *other] else: to_concat = [self, other] - return concat( - to_concat, - ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort, - ) + return ( + concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) + ).__finalize__(self, method="append") def join( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False @@ -7966,7 +7985,7 @@ def join( See Also -------- - DataFrame.merge : For column(s)-on-columns(s) operations. + DataFrame.merge : For column(s)-on-column(s) operations. Notes ----- @@ -9253,6 +9272,9 @@ def to_timestamp( axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) + if not isinstance(old_ax, PeriodIndex): + raise TypeError(f"unsupported Type {type(old_ax).__name__}") + new_ax = old_ax.to_timestamp(freq=freq, how=how) setattr(new_obj, axis_name, new_ax) @@ -9282,6 +9304,9 @@ def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> DataFrame: axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) + if not isinstance(old_ax, DatetimeIndex): + raise TypeError(f"unsupported Type {type(old_ax).__name__}") + new_ax = old_ax.to_period(freq=freq) setattr(new_obj, axis_name, new_ax) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c90ab9cceea8c..36ce2c4776bd0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2196,7 +2196,7 @@ def to_json( * Series: - default is 'index' - - allowed values are: {'split','records','index','table'}. + - allowed values are: {'split', 'records', 'index', 'table'}. * DataFrame: @@ -9347,10 +9347,13 @@ def shift( def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: """ Equivalent to `shift` without copying data. - The shifted data will not include the dropped periods and the shifted axis will be smaller than the original. + .. deprecated:: 1.2.0 + slice_shift is deprecated, + use DataFrame/Series.shift instead. + Parameters ---------- periods : int @@ -9365,6 +9368,14 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: While the `slice_shift` is faster than `shift`, you may pay for it later during alignment. """ + + msg = ( + "The 'slice_shift' method is deprecated " + "and will be removed in a future version. " + "You can use DataFrame/Series.shift instead" + ) + warnings.warn(msg, FutureWarning, stacklevel=2) + if periods == 0: return self diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 2387427d15670..8e278dc81a8cc 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -63,9 +63,8 @@ def _gotitem(self, key, ndim, subset=None): self = type(self)(subset, groupby=groupby, parent=self, **kwargs) self._reset_cache() - if subset.ndim == 2: - if is_scalar(key) and key in subset or is_list_like(key): - self._selection = key + if subset.ndim == 2 and (is_scalar(key) and key in subset or is_list_like(key)): + self._selection = key return self diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 457aed3a72799..d0b58e8abc4ee 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -512,12 +512,9 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): elif func not in base.transform_kernel_allowlist: msg = f"'{func}' is not a valid function name for transform(name)" raise ValueError(msg) - elif func in base.cythonized_kernels: + elif func in base.cythonized_kernels or func in base.transformation_kernels: # cythonized transform or canned "agg+broadcast" return getattr(self, func)(*args, **kwargs) - elif func in base.transformation_kernels: - return getattr(self, func)(*args, **kwargs) - # If func is a reduction, we need to broadcast the # result to the whole group. Compute func result # and deal with possible broadcasting below. @@ -1111,8 +1108,7 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: # unwrap DataFrame to get array result = result._mgr.blocks[0].values - res_values = cast_agg_result(result, bvalues, how) - return res_values + return cast_agg_result(result, bvalues, how) # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block @@ -1368,12 +1364,9 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): elif func not in base.transform_kernel_allowlist: msg = f"'{func}' is not a valid function name for transform(name)" raise ValueError(msg) - elif func in base.cythonized_kernels: + elif func in base.cythonized_kernels or func in base.transformation_kernels: # cythonized transformation or canned "reduction+broadcast" return getattr(self, func)(*args, **kwargs) - elif func in base.transformation_kernels: - return getattr(self, func)(*args, **kwargs) - # GH 30918 # Use _transform_fast only when we know func is an aggregation if func in base.reduction_kernels: @@ -1401,9 +1394,10 @@ def _transform_fast(self, result: DataFrame) -> DataFrame: # by take operation ids, _, ngroup = self.grouper.group_info result = result.reindex(self.grouper.result_index, copy=False) - output = [] - for i, _ in enumerate(result.columns): - output.append(algorithms.take_1d(result.iloc[:, i].values, ids)) + output = [ + algorithms.take_1d(result.iloc[:, i].values, ids) + for i, _ in enumerate(result.columns) + ] return self.obj._constructor._from_arrays( output, columns=result.columns, index=obj.index @@ -1462,7 +1456,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: else: inds.append(i) - if len(output) == 0: + if not output: raise TypeError("Transform function invalid for data types") columns = obj.columns diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c5bc9b563ea5e..32023576b0a91 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1001,7 +1001,7 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): key = base.OutputKey(label=name, position=idx) output[key] = result - if len(output) == 0: + if not output: raise DataError("No numeric types to aggregate") return self._wrap_transformed_output(output) @@ -1084,7 +1084,7 @@ def _cython_agg_general( output[key] = maybe_cast_result(result, obj, how=how) idx += 1 - if len(output) == 0: + if not output: raise DataError("No numeric types to aggregate") return self._wrap_aggregated_output(output, index=self.grouper.result_index) @@ -1182,7 +1182,7 @@ def _python_agg_general(self, func, *args, **kwargs): key = base.OutputKey(label=name, position=idx) output[key] = maybe_cast_result(result, obj, numeric_only=True) - if len(output) == 0: + if not output: return self._python_apply_general(f, self._selected_obj) if self.grouper._filter_empty_groups: @@ -2550,9 +2550,8 @@ def _get_cythonized_result( """ if result_is_index and aggregate: raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") - if post_processing: - if not callable(post_processing): - raise ValueError("'post_processing' must be a callable!") + if post_processing and not callable(post_processing): + raise ValueError("'post_processing' must be a callable!") if pre_processing: if not callable(pre_processing): raise ValueError("'pre_processing' must be a callable!") @@ -2631,7 +2630,7 @@ def _get_cythonized_result( output[key] = result # error_msg is "" on an frame/series with no rows or columns - if len(output) == 0 and error_msg != "": + if not output and error_msg != "": raise TypeError(error_msg) if aggregate: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 9f0d953a2cc71..ff5379567f090 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -593,23 +593,25 @@ def group_index(self) -> Index: return self._group_index def _make_codes(self) -> None: - if self._codes is None or self._group_index is None: - # we have a list of groupers - if isinstance(self.grouper, ops.BaseGrouper): - codes = self.grouper.codes_info - uniques = self.grouper.result_index + if self._codes is not None and self._group_index is not None: + return + + # we have a list of groupers + if isinstance(self.grouper, ops.BaseGrouper): + codes = self.grouper.codes_info + uniques = self.grouper.result_index + else: + # GH35667, replace dropna=False with na_sentinel=None + if not self.dropna: + na_sentinel = None else: - # GH35667, replace dropna=False with na_sentinel=None - if not self.dropna: - na_sentinel = None - else: - na_sentinel = -1 - codes, uniques = algorithms.factorize( - self.grouper, sort=self.sort, na_sentinel=na_sentinel - ) - uniques = Index(uniques, name=self.name) - self._codes = codes - self._group_index = uniques + na_sentinel = -1 + codes, uniques = algorithms.factorize( + self.grouper, sort=self.sort, na_sentinel=na_sentinel + ) + uniques = Index(uniques, name=self.name) + self._codes = codes + self._group_index = uniques @cache_readonly def groups(self) -> Dict[Hashable, np.ndarray]: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index bca71b5c9646b..438030008bb4d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -24,7 +24,7 @@ from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby import pandas._libs.reduction as libreduction -from pandas._typing import F, FrameOrSeries, Label +from pandas._typing import F, FrameOrSeries, Label, Shape from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -116,7 +116,7 @@ def groupings(self) -> List["grouper.Grouping"]: return self._groupings @property - def shape(self) -> Tuple[int, ...]: + def shape(self) -> Shape: return tuple(ping.ngroups for ping in self.groupings) def __iter__(self): @@ -140,9 +140,16 @@ def get_iterator( splitter = self._get_splitter(data, axis=axis) keys = self._get_group_keys() for key, (i, group) in zip(keys, splitter): - yield key, group + yield key, group.__finalize__(data, method="groupby") def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": + """ + Returns + ------- + Generator yielding subsetted objects + + __finalize__ has not been called for the the subsetted objects returned. + """ comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -229,12 +236,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): @cache_readonly def indices(self): """ dict {group name -> group indices} """ - if len(self.groupings) == 1: - return self.groupings[0].indices - else: - codes_list = [ping.codes for ping in self.groupings] - keys = [ping.group_index for ping in self.groupings] - return get_indexer_dict(codes_list, keys) + codes_list = [ping.codes for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] + return get_indexer_dict(codes_list, keys) @property def codes(self) -> List[np.ndarray]: @@ -318,10 +322,9 @@ def result_index(self) -> Index: codes = self.reconstructed_codes levels = [ping.result_index for ping in self.groupings] - result = MultiIndex( + return MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names ) - return result def get_group_levels(self) -> List[Index]: if not self.compressed and len(self.groupings) == 1: @@ -921,7 +924,8 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - return type(sdata)(mgr, name=sdata.name, fastpath=True) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr, name=sdata.name, fastpath=True) class FrameSplitter(DataSplitter): @@ -937,7 +941,8 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - return type(sdata)(mgr) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr) def get_splitter( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5ee5e867567b3..98ec3b55e65d9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,7 +27,7 @@ from pandas._libs.lib import is_datetime_array, no_default from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime, Timestamp from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label, final +from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label, Shape, final from pandas.compat.numpy import function as nv from pandas.errors import DuplicateLabelError, InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -40,7 +40,6 @@ ensure_int64, ensure_object, ensure_platform_int, - is_bool, is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype, @@ -4079,23 +4078,16 @@ def where(self, cond, other=None): if other is None: other = self._na_value - dtype = self.dtype values = self.values - if is_bool(other) or is_bool_dtype(other): - - # bools force casting - values = values.astype(object) - dtype = None + try: + self._validate_fill_value(other) + except (ValueError, TypeError): + return self.astype(object).where(cond, other) values = np.where(cond, values, other) - if self._is_numeric_dtype and np.any(isna(values)): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return Index(values, dtype=dtype, name=self.name) + return Index(values, name=self.name) # construction helpers @final @@ -5652,7 +5644,7 @@ def _maybe_disable_logical_methods(self, opname: str_t): make_invalid_op(opname)(self) @property - def shape(self): + def shape(self) -> Shape: """ Return a tuple of the shape of the underlying data. """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2f2836519d847..8cbd0d83c78d7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -255,7 +255,7 @@ def _is_dtype_compat(self, other) -> Categorical: """ if is_categorical_dtype(other): other = extract_array(other) - if not other.is_dtype_equal(self): + if not other._categories_match_up_to_permutation(self): raise TypeError( "categories must match existing categories when appending" ) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9215fc8994d87..9e2ac6013cb43 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -482,16 +482,9 @@ def isin(self, values, level=None): @Appender(Index.where.__doc__) def where(self, cond, other=None): - values = self._data._ndarray + other = self._data._validate_setitem_value(other) - try: - other = self._data._validate_where_value(other) - except (TypeError, ValueError) as err: - # Includes tzawareness mismatch and IncompatibleFrequencyError - oth = getattr(other, "dtype", other) - raise TypeError(f"Where requires matching dtype, not {oth}") from err - - result = np.where(cond, values, other) + result = np.where(cond, self._data._ndarray, other) arr = self._data._from_backing_data(result) return type(self)._simple_new(arr, name=self.name) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index d37ec12fd3eda..cd1871e4687f3 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -343,7 +343,7 @@ def insert(self, loc: int, item): def putmask(self, mask, value): try: - value = self._data._validate_where_value(value) + value = self._data._validate_setitem_value(value) except (TypeError, ValueError): return self.astype(object).putmask(mask, value) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 1bd71f00b534d..c700acc24f411 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.cast import ( find_common_type, infer_dtype_from_scalar, + maybe_box_datetimelike, maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( @@ -1074,19 +1075,6 @@ def _is_all_dates(self) -> bool: # TODO: arithmetic operations - # GH#30817 until IntervalArray implements inequalities, get them from Index - def __lt__(self, other): - return Index.__lt__(self, other) - - def __le__(self, other): - return Index.__le__(self, other) - - def __gt__(self, other): - return Index.__gt__(self, other) - - def __ge__(self, other): - return Index.__ge__(self, other) - def _is_valid_endpoint(endpoint) -> bool: """ @@ -1206,8 +1194,8 @@ def interval_range( IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], closed='both', dtype='interval[int64]') """ - start = com.maybe_box_datetimelike(start) - end = com.maybe_box_datetimelike(end) + start = maybe_box_datetimelike(start) + end = maybe_box_datetimelike(end) endpoint = start if start is not None else end if freq is None and com.any_none(periods, start, end): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dc5e6877a6bf5..65e71a6109a5a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -20,7 +20,7 @@ from pandas._libs import algos as libalgos, index as libindex, lib from pandas._libs.hashtable import duplicated_int64 -from pandas._typing import AnyArrayLike, Label, Scalar +from pandas._typing import AnyArrayLike, Label, Scalar, Shape from pandas.compat.numpy import function as nv from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -702,7 +702,7 @@ def array(self): ) @property - def shape(self): + def shape(self) -> Shape: """ Return a tuple of the shape of the underlying data. """ diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index d6f571360b457..9eb8a8b719d41 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -121,6 +121,8 @@ def _validate_fill_value(self, value): # force conversion to object # so we don't lose the bools raise TypeError + if isinstance(value, str): + raise TypeError return value diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c2dad928845a7..c5e331a104726 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1592,7 +1592,11 @@ def _setitem_with_indexer(self, indexer, value): return # add a new item with the dtype setup - self.obj[key] = infer_fill_value(value) + if com.is_null_slice(indexer[0]): + # We are setting an entire column + self.obj[key] = value + else: + self.obj[key] = infer_fill_value(value) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes @@ -1641,6 +1645,8 @@ def _setitem_with_indexer_split_path(self, indexer, value): if not isinstance(indexer, tuple): indexer = _tuplify(self.ndim, indexer) + if len(indexer) > self.ndim: + raise IndexError("too many indices for array") if isinstance(value, ABCSeries): value = self._align_series(indexer, value) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 24b00199611bf..1f34e91d71077 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -10,7 +10,7 @@ from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import ArrayLike, Scalar +from pandas._typing import ArrayLike, Scalar, Shape from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -19,6 +19,7 @@ find_common_type, infer_dtype_from, infer_dtype_from_scalar, + maybe_box_datetimelike, maybe_downcast_numeric, maybe_downcast_to_dtype, maybe_infer_dtype_type, @@ -843,7 +844,7 @@ def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: if isna(s): return ~mask - s = com.maybe_box_datetimelike(s) + s = maybe_box_datetimelike(s) return compare_or_regex_search(self.values, s, regex, mask) # Calculate the mask once, prior to the call of comp @@ -2762,7 +2763,7 @@ def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: return values -def safe_reshape(arr, new_shape): +def safe_reshape(arr, new_shape: Shape): """ If possible, reshape `arr` to have shape `new_shape`, with a couple of exceptions (see gh-13012): diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 8559fe72972b8..8efba87b14ce5 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -5,7 +5,7 @@ import numpy as np from pandas._libs import NaT, internals as libinternals -from pandas._typing import DtypeObj +from pandas._typing import DtypeObj, Shape from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote @@ -175,7 +175,7 @@ def _get_mgr_concatenation_plan(mgr, indexers): class JoinUnit: - def __init__(self, block, shape, indexers=None): + def __init__(self, block, shape: Shape, indexers=None): # Passing shape explicitly is required for cases when block is None. if indexers is None: indexers = {} diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index bb8283604abb0..bcafa2c2fdca7 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -14,6 +14,7 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, + dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -346,7 +347,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): oindex = index.astype("O") if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): - val = com.dict_compat(val) + val = dict_compat(val) else: val = dict(val) val = lib.fast_multiget(val, oindex._values, default=np.nan) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 49ca8f9ad55e9..a06d57e268fe2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -17,7 +17,7 @@ import numpy as np from pandas._libs import internals as libinternals, lib -from pandas._typing import ArrayLike, DtypeObj, Label +from pandas._typing import ArrayLike, DtypeObj, Label, Shape from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -204,7 +204,7 @@ def __nonzero__(self) -> bool: __bool__ = __nonzero__ @property - def shape(self) -> Tuple[int, ...]: + def shape(self) -> Shape: return tuple(len(ax) for ax in self.axes) @property @@ -1825,7 +1825,7 @@ def _asarray_compat(x): else: return np.asarray(x) - def _shape_compat(x): + def _shape_compat(x) -> Shape: if isinstance(x, ABCSeries): return (len(x),) else: diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 97fa7988c1774..8142fc3e695a3 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -5,13 +5,13 @@ from datetime import timedelta from functools import partial import operator -from typing import Any, Tuple +from typing import Any import warnings import numpy as np from pandas._libs import Timedelta, Timestamp, lib, ops as libops -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, Shape from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, @@ -427,7 +427,7 @@ def maybe_upcast_datetimelike_array(obj: ArrayLike) -> ArrayLike: return obj -def _maybe_upcast_for_op(obj, shape: Tuple[int, ...]): +def _maybe_upcast_for_op(obj, shape: Shape): """ Cast non-pandas objects to pandas types to unify behavior of arithmetic and comparison operations. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5012be593820e..978597e3c7686 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -689,7 +689,7 @@ def get_result(self): self._maybe_restore_index_levels(result) - return result + return result.__finalize__(self, method="merge") def _indicator_pre_merge( self, left: "DataFrame", right: "DataFrame" @@ -830,12 +830,15 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill) # if we have an all missing left_indexer - # make sure to just use the right values - mask = left_indexer == -1 - if mask.all(): + # make sure to just use the right values or vice-versa + mask_left = left_indexer == -1 + mask_right = right_indexer == -1 + if mask_left.all(): key_col = rvals + elif mask_right.all(): + key_col = lvals else: - key_col = Index(lvals).where(~mask, rvals) + key_col = Index(lvals).where(~mask_left, rvals) if result._is_label_reference(name): result[name] = key_col @@ -1083,7 +1086,7 @@ def _maybe_coerce_merge_keys(self): # if either left or right is a categorical # then the must match exactly in categories & ordered if lk_is_cat and rk_is_cat: - if lk.is_dtype_equal(rk): + if lk._categories_match_up_to_permutation(rk): continue elif lk_is_cat or rk_is_cat: @@ -1502,7 +1505,7 @@ def get_result(self): ) typ = self.left._constructor - result = typ(result_data).__finalize__(self, method=self._merge_type) + result = typ(result_data) self._maybe_add_join_keys(result, left_indexer, right_indexer) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 38e36c8ff8d01..c9940c78b8d7d 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -91,7 +91,7 @@ index. If a dict or Series is passed, the Series or dict VALUES will be used to determine the groups (the Series' values are first aligned; see ``.align()`` method). If an ndarray is passed, the - values are used as-is determine the groups. A label or list of + values are used as-is to determine the groups. A label or list of labels may be passed to group by the columns in ``self``. Notice that a tuple is interpreted as a (single) key. axis : {0 or 'index', 1 or 'columns'}, default 0 diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2e32a7572adc7..e390229b5dcba 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -4,6 +4,7 @@ TYPE_CHECKING, Callable, DefaultDict, + Dict, Iterable, List, Optional, @@ -528,16 +529,22 @@ def get_flattened_list( return [tuple(array) for array in arrays.values()] -def get_indexer_dict(label_list, keys): +def get_indexer_dict( + label_list: List[np.ndarray], keys: List["Index"] +) -> Dict[Union[str, Tuple], np.ndarray]: """ Returns ------- - dict + dict: Labels mapped to indexers. """ shape = [len(x) for x in keys] group_index = get_group_index(label_list, shape, sort=True, xnull=True) + if np.all(group_index == -1): + # When all keys are nan and dropna=True, indices_fast can't handle this + # and the return is empty anyway + return {} ngroups = ( ((group_index.size and group_index.max()) + 1) if is_int64_overflow_possible(shape) diff --git a/pandas/io/common.py b/pandas/io/common.py index c147ae9fd0aa8..910eb23d9a2d0 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -2,8 +2,9 @@ import bz2 from collections import abc +import dataclasses import gzip -from io import BufferedIOBase, BytesIO, RawIOBase +from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper import mmap import os import pathlib @@ -13,12 +14,14 @@ Any, AnyStr, Dict, + Generic, List, Mapping, Optional, Tuple, Type, Union, + cast, ) from urllib.parse import ( urljoin, @@ -31,12 +34,12 @@ import zipfile from pandas._typing import ( + Buffer, CompressionDict, CompressionOptions, EncodingVar, FileOrBuffer, FilePathOrBuffer, - IOargs, ModeVar, StorageOptions, ) @@ -56,6 +59,77 @@ from io import IOBase +@dataclasses.dataclass +class IOArgs(Generic[ModeVar, EncodingVar]): + """ + Return value of io/common.py:get_filepath_or_buffer. + + This is used to easily close created fsspec objects. + + Note (copy&past from io/parsers): + filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] + though mypy handling of conditional imports is difficult. + See https://github.com/python/mypy/issues/1297 + """ + + filepath_or_buffer: FileOrBuffer + encoding: EncodingVar + mode: Union[ModeVar, str] + compression: CompressionDict + should_close: bool = False + + def close(self) -> None: + """ + Close the buffer if it was created by get_filepath_or_buffer. + """ + if self.should_close: + assert not isinstance(self.filepath_or_buffer, str) + try: + self.filepath_or_buffer.close() + except (OSError, ValueError): + pass + self.should_close = False + + +@dataclasses.dataclass +class IOHandles: + """ + Return value of io/common.py:get_handle + + This is used to easily close created buffers and to handle corner cases when + TextIOWrapper is inserted. + + handle: The file handle to be used. + created_handles: All file handles that are created by get_handle + is_wrapped: Whether a TextIOWrapper needs to be detached. + """ + + handle: Buffer + created_handles: List[Buffer] = dataclasses.field(default_factory=list) + is_wrapped: bool = False + is_mmap: bool = False + + def close(self) -> None: + """ + Close all created buffers. + + Note: If a TextIOWrapper was inserted, it is flushed and detached to + avoid closing the potentially user-created buffer. + """ + if self.is_wrapped: + assert isinstance(self.handle, TextIOWrapper) + self.handle.flush() + self.handle.detach() + self.created_handles.remove(self.handle) + try: + for handle in self.created_handles: + handle.close() + except (OSError, ValueError): + pass + self.created_handles = [] + self.is_wrapped = False + + def is_url(url) -> bool: """ Check to see if a URL has a valid protocol. @@ -176,7 +250,7 @@ def get_filepath_or_buffer( compression: CompressionOptions = None, mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, -) -> IOargs[ModeVar, EncodingVar]: +) -> IOArgs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -201,7 +275,7 @@ def get_filepath_or_buffer( ..versionchange:: 1.2.0 - Returns the dataclass IOargs. + Returns the dataclass IOArgs. """ filepath_or_buffer = stringify_path(filepath_or_buffer) @@ -225,6 +299,10 @@ def get_filepath_or_buffer( compression = dict(compression, method=compression_method) + # uniform encoding names + if encoding is not None: + encoding = encoding.replace("_", "-").lower() + # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files if ( @@ -258,7 +336,7 @@ def get_filepath_or_buffer( compression = {"method": "gzip"} reader = BytesIO(req.read()) req.close() - return IOargs( + return IOArgs( filepath_or_buffer=reader, encoding=encoding, compression=compression, @@ -310,7 +388,7 @@ def get_filepath_or_buffer( filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) ).open() - return IOargs( + return IOArgs( filepath_or_buffer=file_obj, encoding=encoding, compression=compression, @@ -323,7 +401,7 @@ def get_filepath_or_buffer( ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): - return IOargs( + return IOArgs( filepath_or_buffer=_expand_user(filepath_or_buffer), encoding=encoding, compression=compression, @@ -335,7 +413,7 @@ def get_filepath_or_buffer( msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) - return IOargs( + return IOArgs( filepath_or_buffer=filepath_or_buffer, encoding=encoding, compression=compression, @@ -455,14 +533,14 @@ def infer_compression( def get_handle( - path_or_buf, + path_or_buf: FilePathOrBuffer, mode: str, - encoding=None, + encoding: Optional[str] = None, compression: CompressionOptions = None, memory_map: bool = False, is_text: bool = True, - errors=None, -): + errors: Optional[str] = None, +) -> IOHandles: """ Get file handle for given path/buffer and mode. @@ -506,14 +584,9 @@ def get_handle( See the errors argument for :func:`open` for a full list of options. - .. versionadded:: 1.1.0 + .. versionchanged:: 1.2.0 - Returns - ------- - f : file-like - A file-like object. - handles : list of file-like objects - A list of file-like object that were opened in this function. + Returns the dataclass IOHandles """ need_text_wrapping: Tuple[Type["IOBase"], ...] try: @@ -532,41 +605,49 @@ def get_handle( except ImportError: pass - handles: List[Union[IO, _MMapWrapper]] = list() - f = path_or_buf + # Windows does not default to utf-8. Set to utf-8 for a consistent behavior + if encoding is None: + encoding = "utf-8" # Convert pathlib.Path/py.path.local or string - path_or_buf = stringify_path(path_or_buf) - is_path = isinstance(path_or_buf, str) + handle = stringify_path(path_or_buf) compression, compression_args = get_compression_method(compression) - if is_path: - compression = infer_compression(path_or_buf, compression) + compression = infer_compression(handle, compression) - if compression: + # memory mapping needs to be the first step + handle, memory_map, handles = _maybe_memory_map( + handle, memory_map, encoding, mode, errors + ) + is_path = isinstance(handle, str) + if compression: # GZ Compression if compression == "gzip": if is_path: - f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args) + assert isinstance(handle, str) + handle = gzip.GzipFile(filename=handle, mode=mode, **compression_args) else: - f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args) + handle = gzip.GzipFile( + fileobj=handle, # type: ignore[arg-type] + mode=mode, + **compression_args, + ) # BZ Compression elif compression == "bz2": - f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) + handle = bz2.BZ2File( + handle, mode=mode, **compression_args # type: ignore[arg-type] + ) # ZIP Compression elif compression == "zip": - zf = _BytesZipFile(path_or_buf, mode, **compression_args) - # Ensure the container is closed as well. - handles.append(zf) - if zf.mode == "w": - f = zf - elif zf.mode == "r": - zip_names = zf.namelist() + handle = _BytesZipFile(handle, mode, **compression_args) + if handle.mode == "r": + handles.append(handle) + zip_names = handle.namelist() if len(zip_names) == 1: - f = zf.open(zip_names.pop()) + handle = handle.open(zip_names.pop()) elif len(zip_names) == 0: raise ValueError(f"Zero files found in ZIP file {path_or_buf}") else: @@ -577,55 +658,53 @@ def get_handle( # XZ Compression elif compression == "xz": - f = get_lzma_file(lzma)(path_or_buf, mode) + handle = get_lzma_file(lzma)(handle, mode) # Unrecognized Compression else: msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) - handles.append(f) + assert not isinstance(handle, str) + handles.append(handle) elif is_path: # Check whether the filename is to be opened in binary mode. # Binary mode does not support 'encoding' and 'newline'. - is_binary_mode = "b" in mode - - if encoding and not is_binary_mode: + assert isinstance(handle, str) + if encoding and "b" not in mode: # Encoding - f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="") - elif is_text and not is_binary_mode: - # No explicit encoding - f = open(path_or_buf, mode, errors="replace", newline="") + handle = open(handle, mode, encoding=encoding, errors=errors, newline="") else: # Binary mode - f = open(path_or_buf, mode) - handles.append(f) + handle = open(handle, mode) + handles.append(handle) # Convert BytesIO or file objects passed with an encoding - if is_text and (compression or isinstance(f, need_text_wrapping)): - from io import TextIOWrapper - - g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="") - if not isinstance(f, (BufferedIOBase, RawIOBase)): - handles.append(g) - f = g - - if memory_map and hasattr(f, "fileno"): - try: - wrapped = _MMapWrapper(f) - f.close() - handles.remove(f) - handles.append(wrapped) - f = wrapped - except Exception: - # we catch any errors that may have occurred - # because that is consistent with the lower-level - # functionality of the C engine (pd.read_csv), so - # leave the file handler as is then - pass - - return f, handles + is_wrapped = False + if is_text and ( + compression + or isinstance(handle, need_text_wrapping) + or "b" in getattr(handle, "mode", "") + ): + handle = TextIOWrapper( + handle, # type: ignore[arg-type] + encoding=encoding, + errors=errors, + newline="", + ) + handles.append(handle) + # do not mark as wrapped when the user provided a string + is_wrapped = not is_path + + handles.reverse() # close the most recently added buffer first + assert not isinstance(handle, str) + return IOHandles( + handle=handle, + created_handles=handles, + is_wrapped=is_wrapped, + is_mmap=memory_map, + ) # error: Definition of "__exit__" in base class "ZipFile" is incompatible with @@ -688,9 +767,16 @@ class _MMapWrapper(abc.Iterator): """ def __init__(self, f: IO): + self.attributes = {} + for attribute in ("seekable", "readable", "writeable"): + if not hasattr(f, attribute): + continue + self.attributes[attribute] = getattr(f, attribute)() self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) def __getattr__(self, name: str): + if name in self.attributes: + return lambda: self.attributes[name] return getattr(self.mmap, name) def __iter__(self) -> "_MMapWrapper": @@ -709,3 +795,42 @@ def __next__(self) -> str: if newline == "": raise StopIteration return newline + + +def _maybe_memory_map( + handle: FileOrBuffer, + memory_map: bool, + encoding: str, + mode: str, + errors: Optional[str], +) -> Tuple[FileOrBuffer, bool, List[Buffer]]: + """Try to use memory map file/buffer.""" + handles: List[Buffer] = [] + memory_map &= hasattr(handle, "fileno") or isinstance(handle, str) + if not memory_map: + return handle, memory_map, handles + + # need to open the file first + if isinstance(handle, str): + if encoding and "b" not in mode: + # Encoding + handle = open(handle, mode, encoding=encoding, errors=errors, newline="") + else: + # Binary mode + handle = open(handle, mode) + handles.append(handle) + + try: + wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type] + handle.close() + handles.remove(handle) + handles.append(wrapped) + handle = wrapped + except Exception: + # we catch any errors that may have occurred + # because that is consistent with the lower-level + # functionality of the C engine (pd.read_csv), so + # leave the file handler as is then + memory_map = False + + return handle, memory_map, handles diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 3461652f4ea24..03c61c3ed8376 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -17,6 +17,7 @@ from pandas.core.frame import DataFrame from pandas.io.common import ( + IOArgs, get_filepath_or_buffer, is_url, stringify_path, @@ -349,24 +350,37 @@ def read_excel( class BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): + self.ioargs = IOArgs( + filepath_or_buffer=filepath_or_buffer, + encoding=None, + mode=None, + compression={"method": None}, + ) # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): - filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) + self.ioargs = IOArgs( + filepath_or_buffer=BytesIO(urlopen(filepath_or_buffer).read()), + should_close=True, + encoding=None, + mode=None, + compression={"method": None}, + ) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer = get_filepath_or_buffer( + self.ioargs = get_filepath_or_buffer( filepath_or_buffer, storage_options=storage_options - ).filepath_or_buffer + ) - if isinstance(filepath_or_buffer, self._workbook_class): - self.book = filepath_or_buffer - elif hasattr(filepath_or_buffer, "read"): + if isinstance(self.ioargs.filepath_or_buffer, self._workbook_class): + self.book = self.ioargs.filepath_or_buffer + elif hasattr(self.ioargs.filepath_or_buffer, "read"): # N.B. xlrd.Book has a read attribute too - filepath_or_buffer.seek(0) - self.book = self.load_workbook(filepath_or_buffer) - elif isinstance(filepath_or_buffer, str): - self.book = self.load_workbook(filepath_or_buffer) - elif isinstance(filepath_or_buffer, bytes): - self.book = self.load_workbook(BytesIO(filepath_or_buffer)) + assert not isinstance(self.ioargs.filepath_or_buffer, str) + self.ioargs.filepath_or_buffer.seek(0) + self.book = self.load_workbook(self.ioargs.filepath_or_buffer) + elif isinstance(self.ioargs.filepath_or_buffer, str): + self.book = self.load_workbook(self.ioargs.filepath_or_buffer) + elif isinstance(self.ioargs.filepath_or_buffer, bytes): + self.book = self.load_workbook(BytesIO(self.ioargs.filepath_or_buffer)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." @@ -382,7 +396,7 @@ def load_workbook(self, filepath_or_buffer): pass def close(self): - pass + self.ioargs.close() @property @abc.abstractmethod diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 9a42b8289ab47..198acd5862d45 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -81,9 +81,7 @@ def to_feather( feather.write_feather(df, ioargs.filepath_or_buffer, **kwargs) - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - ioargs.filepath_or_buffer.close() + ioargs.close() def read_feather( @@ -137,9 +135,6 @@ def read_feather( ioargs.filepath_or_buffer, columns=columns, use_threads=bool(use_threads) ) - # s3fs only validates the credentials when the file is closed. - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - ioargs.filepath_or_buffer.close() + ioargs.close() return df diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6c62d6825bc84..20226dbb3c9d4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -3,7 +3,6 @@ """ import csv as csvlib -from io import StringIO, TextIOWrapper import os from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union @@ -39,7 +38,7 @@ class CSVFormatter: def __init__( self, formatter: "DataFrameFormatter", - path_or_buf: Optional[FilePathOrBuffer[str]] = None, + path_or_buf: FilePathOrBuffer[str] = "", sep: str = ",", cols: Optional[Sequence[Label]] = None, index_label: Optional[IndexLabel] = None, @@ -60,25 +59,14 @@ def __init__( self.obj = self.fmt.frame - self.encoding = encoding or "utf-8" - - if path_or_buf is None: - path_or_buf = StringIO() - - ioargs = get_filepath_or_buffer( + self.ioargs = get_filepath_or_buffer( path_or_buf, - encoding=self.encoding, + encoding=encoding, compression=compression, mode=mode, storage_options=storage_options, ) - self.compression = ioargs.compression.pop("method") - self.compression_args = ioargs.compression - self.path_or_buf = ioargs.filepath_or_buffer - self.should_close = ioargs.should_close - self.mode = ioargs.mode - self.sep = sep self.index_label = self._initialize_index_label(index_label) self.errors = errors @@ -238,20 +226,19 @@ def save(self) -> None: """ Create the writer & save. """ - # get a handle or wrap an existing handle to take care of 1) compression and - # 2) text -> byte conversion - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, + # apply compression and byte/text conversion + handles = get_handle( + self.ioargs.filepath_or_buffer, + self.ioargs.mode, + encoding=self.ioargs.encoding, errors=self.errors, - compression=dict(self.compression_args, method=self.compression), + compression=self.ioargs.compression, ) try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( - f, + handles.handle, # type: ignore[arg-type] lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting, @@ -263,23 +250,10 @@ def save(self) -> None: self._save() finally: - if self.should_close: - f.close() - elif ( - isinstance(f, TextIOWrapper) - and not f.closed - and f != self.path_or_buf - and hasattr(self.path_or_buf, "write") - ): - # get_handle uses TextIOWrapper for non-binary handles. TextIOWrapper - # closes the wrapped handle if it is not detached. - f.flush() # make sure everything is written - f.detach() # makes f unusable - del f - elif f != self.path_or_buf: - f.close() - for _fh in handles: - _fh.close() + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + self.ioargs.close() def _save(self) -> None: if self._need_to_save_header: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3c759f477899b..43e76d0aef490 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1046,8 +1046,12 @@ def to_csv( """ from pandas.io.formats.csvs import CSVFormatter + created_buffer = path_or_buf is None + if created_buffer: + path_or_buf = StringIO() + csv_formatter = CSVFormatter( - path_or_buf=path_or_buf, + path_or_buf=path_or_buf, # type: ignore[arg-type] line_terminator=line_terminator, sep=sep, encoding=encoding, @@ -1067,9 +1071,11 @@ def to_csv( ) csv_formatter.save() - if path_or_buf is None: - assert isinstance(csv_formatter.path_or_buf, StringIO) - return csv_formatter.path_or_buf.getvalue() + if created_buffer: + assert isinstance(path_or_buf, StringIO) + content = path_or_buf.getvalue() + path_or_buf.close() + return content return None diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 98b9a585d890e..040279b9f3e67 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from collections import abc import functools -from io import BytesIO, StringIO +from io import StringIO from itertools import islice import os -from typing import IO, Any, Callable, List, Mapping, Optional, Tuple, Type, Union +from typing import Any, Callable, Mapping, Optional, Tuple, Type, Union import numpy as np @@ -26,7 +26,12 @@ from pandas.core.generic import NDFrame from pandas.core.reshape.concat import concat -from pandas.io.common import get_compression_method, get_filepath_or_buffer, get_handle +from pandas.io.common import ( + IOHandles, + get_compression_method, + get_filepath_or_buffer, + get_handle, +) from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import validate_integer @@ -59,17 +64,6 @@ def to_json( "'index=False' is only valid when 'orient' is 'split' or 'table'" ) - if path_or_buf is not None: - ioargs = get_filepath_or_buffer( - path_or_buf, - compression=compression, - mode="wt", - storage_options=storage_options, - ) - path_or_buf = ioargs.filepath_or_buffer - should_close = ioargs.should_close - compression = ioargs.compression - if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") @@ -101,20 +95,27 @@ def to_json( if lines: s = convert_to_line_delimits(s) - if isinstance(path_or_buf, str): - fh, handles = get_handle(path_or_buf, "w", compression=compression) + if path_or_buf is not None: + # open fsspec URLs + ioargs = get_filepath_or_buffer( + path_or_buf, + compression=compression, + mode="wt", + storage_options=storage_options, + ) + # apply compression and byte/text conversion + handles = get_handle( + ioargs.filepath_or_buffer, "w", compression=ioargs.compression + ) try: - fh.write(s) + handles.handle.write(s) finally: - fh.close() - for handle in handles: - handle.close() - elif path_or_buf is None: - return s + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + ioargs.close() else: - path_or_buf.write(s) - if should_close: - path_or_buf.close() + return s class Writer(ABC): @@ -262,7 +263,9 @@ def __init__( # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): - raise NotImplementedError("orient='table' is not supported for MultiIndex") + raise NotImplementedError( + "orient='table' is not supported for MultiIndex columns" + ) # TODO: Do this timedelta properly in objToJSON.c See GH #15137 if ( @@ -545,12 +548,10 @@ def read_json( dtype = True if convert_axes is None and orient != "table": convert_axes = True - if encoding is None: - encoding = "utf-8" ioargs = get_filepath_or_buffer( path_or_buf, - encoding=encoding, + encoding=encoding or "utf-8", compression=compression, storage_options=storage_options, ) @@ -577,9 +578,7 @@ def read_json( return json_reader result = json_reader.read() - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - ioargs.filepath_or_buffer.close() + ioargs.close() return result @@ -629,9 +628,8 @@ def __init__( self.lines = lines self.chunksize = chunksize self.nrows_seen = 0 - self.should_close = False self.nrows = nrows - self.file_handles: List[IO] = [] + self.handles: Optional[IOHandles] = None if self.chunksize is not None: self.chunksize = validate_integer("chunksize", self.chunksize, 1) @@ -670,30 +668,25 @@ def _get_data_from_filepath(self, filepath_or_buffer): This method turns (1) into (2) to simplify the rest of the processing. It returns input types (2) and (3) unchanged. """ - data = filepath_or_buffer - + # if it is a string but the file does not exist, it might be a JSON string exists = False - if isinstance(data, str): + if isinstance(filepath_or_buffer, str): try: exists = os.path.exists(filepath_or_buffer) # gh-5874: if the filepath is too long will raise here except (TypeError, ValueError): pass - if exists or self.compression["method"] is not None: - data, self.file_handles = get_handle( + if exists or not isinstance(filepath_or_buffer, str): + self.handles = get_handle( filepath_or_buffer, "r", encoding=self.encoding, compression=self.compression, ) - self.should_close = True - self.open_stream = data - - if isinstance(data, BytesIO): - data = data.getvalue().decode() + filepath_or_buffer = self.handles.handle - return data + return filepath_or_buffer def _combine_lines(self, lines) -> str: """ @@ -757,13 +750,8 @@ def close(self): If an open stream or file was passed, we leave it open. """ - if self.should_close: - try: - self.open_stream.close() - except (OSError, AttributeError): - pass - for file_handle in self.file_handles: - file_handle.close() + if self.handles is not None: + self.handles.close() def __next__(self): if self.nrows: diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 2b4c86b3c4406..0499a35296490 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -323,10 +323,6 @@ def parse_table_schema(json, precise_float): for field in table["schema"]["fields"] } - # Cannot directly use as_type with timezone data on object; raise for now - if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()): - raise NotImplementedError('table="orient" can not yet read timezone data') - # No ISO constructor for Timedelta as of yet, so need to raise if "timedelta64" in dtypes.values(): raise NotImplementedError( diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 829ff6408d86d..5a734f0878a0c 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -53,4 +53,5 @@ def read_orc( ioargs = get_filepath_or_buffer(path) orc_file = pyarrow.orc.ORCFile(ioargs.filepath_or_buffer) result = orc_file.read(columns=columns, **kwargs).to_pandas() + ioargs.close() return result diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2110a2d400be8..e4895d280c241 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import StringIO, TextIOWrapper +from io import StringIO import itertools import re import sys @@ -428,17 +428,16 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" - encoding = kwds.get("encoding", None) storage_options = kwds.get("storage_options", None) - if encoding is not None: - encoding = re.sub("_", "-", encoding).lower() - kwds["encoding"] = encoding - compression = kwds.get("compression", "infer") ioargs = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression, storage_options=storage_options + filepath_or_buffer, + kwds.get("encoding", None), + kwds.get("compression", "infer"), + storage_options=storage_options, ) kwds["compression"] = ioargs.compression + kwds["encoding"] = ioargs.encoding if kwds.get("date_parser", None) is not None: if isinstance(kwds["parse_dates"], bool): @@ -461,14 +460,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): try: data = parser.read(nrows) finally: + # close compression and byte/text wrapper parser.close() - - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - try: - ioargs.filepath_or_buffer.close() - except ValueError: - pass + # close any fsspec-like objects + ioargs.close() return data @@ -1350,10 +1345,6 @@ def __init__(self, kwds): self._first_chunk = True - # GH 13932 - # keep references to file handles opened by the parser itself - self.handles = [] - def _validate_parse_dates_presence(self, columns: List[str]) -> None: """ Check if parse_dates are in columns. @@ -1403,8 +1394,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: ) def close(self): - for f in self.handles: - f.close() + self.handles.close() @property def _has_complex_date_col(self): @@ -1838,23 +1828,19 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - encoding = kwds.get("encoding") - - # parsers.TextReader doesn't support compression dicts - if isinstance(kwds.get("compression"), dict): - kwds["compression"] = kwds["compression"]["method"] - - if kwds.get("compression") is None and encoding: - if isinstance(src, str): - src = open(src, "rb") - self.handles.append(src) - - # Handle the file object with universal line mode enabled. - # We will handle the newline character ourselves later on. - if hasattr(src, "read") and not hasattr(src, "encoding"): - src = TextIOWrapper(src, encoding=encoding, newline="") - - kwds["encoding"] = "utf-8" + self.handles = get_handle( + src, + mode="r", + encoding=kwds.get("encoding", None), + compression=kwds.get("compression", None), + memory_map=kwds.get("memory_map", False), + is_text=True, + ) + kwds.pop("encoding", None) + kwds.pop("memory_map", None) + kwds.pop("compression", None) + if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): + self.handles.handle = self.handles.handle.mmap # #2442 kwds["allow_leading_cols"] = self.index_col is not False @@ -1863,7 +1849,7 @@ def __init__(self, src, **kwds): self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) kwds["usecols"] = self.usecols - self._reader = parsers.TextReader(src, **kwds) + self._reader = parsers.TextReader(self.handles.handle, **kwds) self.unnamed_cols = self._reader.unnamed_cols passed_names = self.names is None @@ -1942,11 +1928,10 @@ def __init__(self, src, **kwds): self._implicit_index = self._reader.leading_cols > 0 - def close(self): - for f in self.handles: - f.close() + def close(self) -> None: + super().close() - # close additional handles opened by C parser (for compression) + # close additional handles opened by C parser try: self._reader.close() except ValueError: @@ -2237,20 +2222,19 @@ def __init__(self, f, **kwds): self.comment = kwds["comment"] self._comment_lines = [] - f, handles = get_handle( + self.handles = get_handle( f, "r", encoding=self.encoding, compression=self.compression, memory_map=self.memory_map, ) - self.handles.extend(handles) # Set self.data to something that can read lines. - if hasattr(f, "readline"): - self._make_reader(f) + if hasattr(self.handles.handle, "readline"): + self._make_reader(self.handles.handle) else: - self.data = f + self.data = self.handles.handle # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 426a40a65b522..6fa044b4651a5 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -92,25 +92,18 @@ def to_pickle( mode="wb", storage_options=storage_options, ) - f, fh = get_handle( + handles = get_handle( ioargs.filepath_or_buffer, "wb", compression=ioargs.compression, is_text=False ) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: - pickle.dump(obj, f, protocol=protocol) + pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type] finally: - if f != filepath_or_buffer: - # do not close user-provided file objects GH 35679 - f.close() - for _f in fh: - _f.close() - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - try: - ioargs.filepath_or_buffer.close() - except ValueError: - pass + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + ioargs.close() def read_pickle( @@ -193,7 +186,7 @@ def read_pickle( ioargs = get_filepath_or_buffer( filepath_or_buffer, compression=compression, storage_options=storage_options ) - f, fh = get_handle( + handles = get_handle( ioargs.filepath_or_buffer, "rb", compression=ioargs.compression, is_text=False ) @@ -208,24 +201,17 @@ def read_pickle( with warnings.catch_warnings(record=True): # We want to silence any warnings about, e.g. moved modules. warnings.simplefilter("ignore", Warning) - return pickle.load(f) + return pickle.load(handles.handle) # type: ignore[arg-type] except excs_to_catch: # e.g. # "No module named 'pandas.core.sparse.series'" # "Can't get attribute '__nat_unpickle' on Optional[Shape]: try: ndim = self.ndim diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 989036917b265..e9b74199cbc42 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -16,7 +16,7 @@ from collections import abc from datetime import datetime, timedelta import struct -from typing import IO, Any, Union +from typing import IO, Any, Union, cast import numpy as np @@ -131,8 +131,6 @@ class SAS7BDATReader(ReaderBase, abc.Iterator): bytes. """ - _path_or_buf: IO[Any] - def __init__( self, path_or_buf, @@ -170,14 +168,12 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer - if isinstance(path_or_buf, str): - buf = open(path_or_buf, "rb") - self.handle = buf - else: - buf = path_or_buf + self.ioargs = get_filepath_or_buffer(path_or_buf) + if isinstance(self.ioargs.filepath_or_buffer, str): + self.ioargs.filepath_or_buffer = open(path_or_buf, "rb") + self.ioargs.should_close = True - self._path_or_buf: IO[Any] = buf + self._path_or_buf = cast(IO[Any], self.ioargs.filepath_or_buffer) try: self._get_properties() @@ -202,10 +198,7 @@ def column_types(self): return np.asarray(self._column_types, dtype=np.dtype("S1")) def close(self): - try: - self.handle.close() - except AttributeError: - pass + self.ioargs.close() def _get_properties(self): diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 2a48abe9fbd63..4303cef2df60d 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -10,6 +10,7 @@ from collections import abc from datetime import datetime import struct +from typing import IO, cast import warnings import numpy as np @@ -252,17 +253,13 @@ def __init__( self._index = index self._chunksize = chunksize - if isinstance(filepath_or_buffer, str): - filepath_or_buffer = get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding - ).filepath_or_buffer + self.ioargs = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) - if isinstance(filepath_or_buffer, (str, bytes)): - self.filepath_or_buffer = open(filepath_or_buffer, "rb") - else: - # Since xport files include non-text byte sequences, xport files - # should already be opened in binary mode in Python 3. - self.filepath_or_buffer = filepath_or_buffer + if isinstance(self.ioargs.filepath_or_buffer, str): + self.ioargs.filepath_or_buffer = open(self.ioargs.filepath_or_buffer, "rb") + self.ioargs.should_close = True + + self.filepath_or_buffer = cast(IO[bytes], self.ioargs.filepath_or_buffer) try: self._read_header() @@ -271,7 +268,7 @@ def __init__( raise def close(self): - self.filepath_or_buffer.close() + self.ioargs.close() def _get_row(self): return self.filepath_or_buffer.read(80).decode() diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index caf53b5be971a..446e2daaa1f9c 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -139,5 +139,4 @@ def read_sas( try: return reader.read() finally: - if ioargs.should_close: - reader.close() + ioargs.close() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index cec73ceb17f09..7c7997f128086 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -16,18 +16,7 @@ from pathlib import Path import struct import sys -from typing import ( - Any, - AnyStr, - BinaryIO, - Dict, - List, - Optional, - Sequence, - Tuple, - Union, - cast, -) +from typing import Any, AnyStr, Dict, List, Optional, Sequence, Tuple, Union, cast import warnings from dateutil.relativedelta import relativedelta @@ -35,7 +24,13 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import CompressionOptions, FilePathOrBuffer, Label, StorageOptions +from pandas._typing import ( + Buffer, + CompressionOptions, + FilePathOrBuffer, + Label, + StorageOptions, +) from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -58,7 +53,12 @@ from pandas.core.indexes.base import Index from pandas.core.series import Series -from pandas.io.common import get_filepath_or_buffer, get_handle, stringify_path +from pandas.io.common import ( + IOHandles, + get_filepath_or_buffer, + get_handle, + stringify_path, +) _version_error = ( "Version of given Stata file is {version}. pandas supports importing " @@ -1062,19 +1062,20 @@ def __init__( self._lines_read = 0 self._native_byteorder = _set_endianness(sys.byteorder) - path_or_buf = stringify_path(path_or_buf) - if isinstance(path_or_buf, str): - path_or_buf = get_filepath_or_buffer( - path_or_buf, storage_options=storage_options - ).filepath_or_buffer - - if isinstance(path_or_buf, (str, bytes)): - self.path_or_buf = open(path_or_buf, "rb") + self.ioargs = get_filepath_or_buffer( + path_or_buf, storage_options=storage_options + ) + + if isinstance(self.ioargs.filepath_or_buffer, (str, bytes)): + self.ioargs.filepath_or_buffer = open(self.ioargs.filepath_or_buffer, "rb") + self.ioargs.should_close = True elif hasattr(path_or_buf, "read"): # Copy to BytesIO, and ensure no encoding - pb: Any = path_or_buf - contents = pb.read() - self.path_or_buf = BytesIO(contents) + contents = self.ioargs.filepath_or_buffer.read() + self.ioargs.close() + self.ioargs.filepath_or_buffer = BytesIO(contents) # type: ignore[arg-type] + self.ioargs.should_close = True + self.path_or_buf = cast(BytesIO, self.ioargs.filepath_or_buffer) self._read_header() self._setup_dtype() @@ -1089,10 +1090,7 @@ def __exit__(self, exc_type, exc_value, traceback) -> None: def close(self) -> None: """ close the handle if its open """ - try: - self.path_or_buf.close() - except OSError: - pass + self.ioargs.close() def _set_encoding(self) -> None: """ @@ -1938,7 +1936,7 @@ def _open_file_binary_write( fname: FilePathOrBuffer, compression: CompressionOptions, storage_options: StorageOptions = None, -) -> Tuple[BinaryIO, bool, CompressionOptions]: +) -> Tuple[IOHandles, CompressionOptions]: """ Open a binary file or no-op if file-like. @@ -1958,34 +1956,22 @@ def _open_file_binary_write( docs for the set of allowed keys and values .. versionadded:: 1.2.0 - - Returns - ------- - file : file-like object - File object supporting write - own : bool - True if the file was created, otherwise False """ - if hasattr(fname, "write"): - # See https://github.com/python/mypy/issues/1424 for hasattr challenges - # error: Incompatible return value type (got "Tuple[Union[str, Path, - # IO[Any]], bool, None]", expected "Tuple[BinaryIO, bool, Union[str, - # Mapping[str, str], None]]") - return fname, False, None # type: ignore[return-value] - elif isinstance(fname, (str, Path)): - # Extract compression mode as given, if dict - ioargs = get_filepath_or_buffer( - fname, mode="wb", compression=compression, storage_options=storage_options - ) - f, _ = get_handle( - ioargs.filepath_or_buffer, - "wb", - compression=ioargs.compression, - is_text=False, - ) - return f, True, ioargs.compression - else: - raise TypeError("fname must be a binary file, buffer or path-like.") + ioargs = get_filepath_or_buffer( + fname, mode="wb", compression=compression, storage_options=storage_options + ) + handles = get_handle( + ioargs.filepath_or_buffer, + "wb", + compression=ioargs.compression, + is_text=False, + ) + if ioargs.filepath_or_buffer != fname and not isinstance( + ioargs.filepath_or_buffer, str + ): + # add handle created by get_filepath_or_buffer + handles.created_handles.append(ioargs.filepath_or_buffer) + return handles, ioargs.compression def _set_endianness(endianness: str) -> str: @@ -2236,9 +2222,8 @@ def __init__( self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels - self._own_file = True self._compression = compression - self._output_file: Optional[BinaryIO] = None + self._output_file: Optional[Buffer] = None # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) self.storage_options = storage_options @@ -2249,21 +2234,20 @@ def __init__( self._fname = stringify_path(fname) self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} self._converted_names: Dict[Label, str] = {} - self._file: Optional[BinaryIO] = None def _write(self, to_write: str) -> None: """ Helper to call encode before writing to file for Python 3 compat. """ - assert self._file is not None - self._file.write(to_write.encode(self._encoding)) + self.handles.handle.write( + to_write.encode(self._encoding) # type: ignore[arg-type] + ) def _write_bytes(self, value: bytes) -> None: """ Helper to assert file is open before writing. """ - assert self._file is not None - self._file.write(value) + self.handles.handle.write(value) # type: ignore[arg-type] def _prepare_categoricals(self, data: DataFrame) -> DataFrame: """ @@ -2527,12 +2511,14 @@ def _encode_strings(self) -> None: self.data[col] = encoded def write_file(self) -> None: - self._file, self._own_file, compression = _open_file_binary_write( + self.handles, compression = _open_file_binary_write( self._fname, self._compression, storage_options=self.storage_options ) if compression is not None: - self._output_file = self._file - self._file = BytesIO() + # ZipFile creates a file (with the same name) for each write call. + # Write it first into a buffer and then write the buffer to the ZipFile. + self._output_file = self.handles.handle + self.handles.handle = BytesIO() try: self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) self._write_map() @@ -2552,10 +2538,9 @@ def write_file(self) -> None: self._write_map() except Exception as exc: self._close() - if self._own_file: + if isinstance(self._fname, (str, Path)): try: - if isinstance(self._fname, (str, Path)): - os.unlink(self._fname) + os.unlink(self._fname) except OSError: warnings.warn( f"This save was not successful but {self._fname} could not " @@ -2571,24 +2556,18 @@ def _close(self) -> None: Close the file if it was created by the writer. If a buffer or file-like object was passed in, for example a GzipFile, - then leave this file open for the caller to close. In either case, - attempt to flush the file contents to ensure they are written to disk - (if supported) + then leave this file open for the caller to close. """ - # Some file-like objects might not support flush - assert self._file is not None + # write compression if self._output_file is not None: - assert isinstance(self._file, BytesIO) - bio = self._file + assert isinstance(self.handles.handle, BytesIO) + bio = self.handles.handle bio.seek(0) - self._file = self._output_file - self._file.write(bio.read()) - try: - self._file.flush() - except AttributeError: - pass - if self._own_file: - self._file.close() + self.handles.handle = self._output_file + self.handles.handle.write(bio.read()) # type: ignore[arg-type] + bio.close() + # close any created handles + self.handles.close() def _write_map(self) -> None: """No-op, future compatibility""" @@ -3140,8 +3119,8 @@ def _tag(val: Union[str, bytes], tag: str) -> bytes: def _update_map(self, tag: str) -> None: """Update map location for tag with file position""" - assert self._file is not None - self._map[tag] = self._file.tell() + assert self.handles.handle is not None + self._map[tag] = self.handles.handle.tell() def _write_header( self, @@ -3208,12 +3187,11 @@ def _write_map(self) -> None: the map with 0s. The second call writes the final map locations when all blocks have been written. """ - assert self._file is not None if not self._map: self._map = dict( ( ("stata_data", 0), - ("map", self._file.tell()), + ("map", self.handles.handle.tell()), ("variable_types", 0), ("varnames", 0), ("sortlist", 0), @@ -3229,7 +3207,7 @@ def _write_map(self) -> None: ) ) # Move to start of map - self._file.seek(self._map["map"]) + self.handles.handle.seek(self._map["map"]) bio = BytesIO() for val in self._map.values(): bio.write(struct.pack(self._byteorder + "Q", val)) diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index b919728971505..b2c7b2610845c 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -1,4 +1,14 @@ -# being a bit too dynamic +from typing import ( + TYPE_CHECKING, + Collection, + Dict, + Iterator, + List, + Optional, + Sequence, + Union, + cast, +) import warnings import matplotlib.cm as cm @@ -9,92 +19,256 @@ import pandas.core.common as com +if TYPE_CHECKING: + from matplotlib.colors import Colormap + + +Color = Union[str, Sequence[float]] + def get_standard_colors( - num_colors: int, colormap=None, color_type: str = "default", color=None + num_colors: int, + colormap: Optional["Colormap"] = None, + color_type: str = "default", + color: Optional[Union[Dict[str, Color], Color, Collection[Color]]] = None, ): - import matplotlib.pyplot as plt + """ + Get standard colors based on `colormap`, `color_type` or `color` inputs. + + Parameters + ---------- + num_colors : int + Minimum number of colors to be returned. + Ignored if `color` is a dictionary. + colormap : :py:class:`matplotlib.colors.Colormap`, optional + Matplotlib colormap. + When provided, the resulting colors will be derived from the colormap. + color_type : {"default", "random"}, optional + Type of colors to derive. Used if provided `color` and `colormap` are None. + Ignored if either `color` or `colormap` are not None. + color : dict or str or sequence, optional + Color(s) to be used for deriving sequence of colors. + Can be either be a dictionary, or a single color (single color string, + or sequence of floats representing a single color), + or a sequence of colors. + + Returns + ------- + dict or list + Standard colors. Can either be a mapping if `color` was a dictionary, + or a list of colors with a length of `num_colors` or more. + + Warns + ----- + UserWarning + If both `colormap` and `color` are provided. + Parameter `color` will override. + """ + if isinstance(color, dict): + return color + + colors = _derive_colors( + color=color, + colormap=colormap, + color_type=color_type, + num_colors=num_colors, + ) + + return _cycle_colors(colors, num_colors=num_colors) + + +def _derive_colors( + *, + color: Optional[Union[Color, Collection[Color]]], + colormap: Optional[Union[str, "Colormap"]], + color_type: str, + num_colors: int, +) -> List[Color]: + """ + Derive colors from either `colormap`, `color_type` or `color` inputs. + + Get a list of colors either from `colormap`, or from `color`, + or from `color_type` (if both `colormap` and `color` are None). + + Parameters + ---------- + color : str or sequence, optional + Color(s) to be used for deriving sequence of colors. + Can be either be a single color (single color string, or sequence of floats + representing a single color), or a sequence of colors. + colormap : :py:class:`matplotlib.colors.Colormap`, optional + Matplotlib colormap. + When provided, the resulting colors will be derived from the colormap. + color_type : {"default", "random"}, optional + Type of colors to derive. Used if provided `color` and `colormap` are None. + Ignored if either `color` or `colormap`` are not None. + num_colors : int + Number of colors to be extracted. + Returns + ------- + list + List of colors extracted. + + Warns + ----- + UserWarning + If both `colormap` and `color` are provided. + Parameter `color` will override. + """ if color is None and colormap is not None: - if isinstance(colormap, str): - cmap = colormap - colormap = cm.get_cmap(colormap) - if colormap is None: - raise ValueError(f"Colormap {cmap} is not recognized") - colors = [colormap(num) for num in np.linspace(0, 1, num=num_colors)] + return _get_colors_from_colormap(colormap, num_colors=num_colors) elif color is not None: if colormap is not None: warnings.warn( "'color' and 'colormap' cannot be used simultaneously. Using 'color'" ) - colors = ( - list(color) - if is_list_like(color) and not isinstance(color, dict) - else color - ) + return _get_colors_from_color(color) else: - if color_type == "default": - # need to call list() on the result to copy so we don't - # modify the global rcParams below - try: - colors = [c["color"] for c in list(plt.rcParams["axes.prop_cycle"])] - except KeyError: - colors = list(plt.rcParams.get("axes.color_cycle", list("bgrcmyk"))) - if isinstance(colors, str): - colors = list(colors) - - colors = colors[0:num_colors] - elif color_type == "random": - - def random_color(column): - """ Returns a random color represented as a list of length 3""" - # GH17525 use common._random_state to avoid resetting the seed - rs = com.random_state(column) - return rs.rand(3).tolist() - - colors = [random_color(num) for num in range(num_colors)] - else: - raise ValueError("color_type must be either 'default' or 'random'") + return _get_colors_from_color_type(color_type, num_colors=num_colors) - if isinstance(colors, str) and _is_single_color(colors): - # GH #36972 - colors = [colors] - # Append more colors by cycling if there is not enough color. - # Extra colors will be ignored by matplotlib if there are more colors - # than needed and nothing needs to be done here. +def _cycle_colors(colors: List[Color], num_colors: int) -> List[Color]: + """Append more colors by cycling if there is not enough color. + + Extra colors will be ignored by matplotlib if there are more colors + than needed and nothing needs to be done here. + """ if len(colors) < num_colors: - try: - multiple = num_colors // len(colors) - 1 - except ZeroDivisionError: - raise ValueError("Invalid color argument: ''") + multiple = num_colors // len(colors) - 1 mod = num_colors % len(colors) - colors += multiple * colors colors += colors[:mod] return colors -def _is_single_color(color: str) -> bool: - """Check if ``color`` is a single color. +def _get_colors_from_colormap( + colormap: Union[str, "Colormap"], + num_colors: int, +) -> List[Color]: + """Get colors from colormap.""" + colormap = _get_cmap_instance(colormap) + return [colormap(num) for num in np.linspace(0, 1, num=num_colors)] + + +def _get_cmap_instance(colormap: Union[str, "Colormap"]) -> "Colormap": + """Get instance of matplotlib colormap.""" + if isinstance(colormap, str): + cmap = colormap + colormap = cm.get_cmap(colormap) + if colormap is None: + raise ValueError(f"Colormap {cmap} is not recognized") + return colormap + + +def _get_colors_from_color( + color: Union[Color, Collection[Color]], +) -> List[Color]: + """Get colors from user input color.""" + if len(color) == 0: + raise ValueError(f"Invalid color argument: {color}") + + if _is_single_color(color): + color = cast(Color, color) + return [color] + + color = cast(Collection[Color], color) + return list(_gen_list_of_colors_from_iterable(color)) + + +def _is_single_color(color: Union[Color, Collection[Color]]) -> bool: + """Check if `color` is a single color, not a sequence of colors. + + Single color is of these kinds: + - Named color "red", "C0", "firebrick" + - Alias "g" + - Sequence of floats, such as (0.1, 0.2, 0.3) or (0.1, 0.2, 0.3, 0.4). + + See Also + -------- + _is_single_string_color + """ + if isinstance(color, str) and _is_single_string_color(color): + # GH #36972 + return True + + if _is_floats_color(color): + return True + + return False + + +def _gen_list_of_colors_from_iterable(color: Collection[Color]) -> Iterator[Color]: + """ + Yield colors from string of several letters or from collection of colors. + """ + for x in color: + if _is_single_color(x): + yield x + else: + raise ValueError(f"Invalid color {x}") + + +def _is_floats_color(color: Union[Color, Collection[Color]]) -> bool: + """Check if color comprises a sequence of floats representing color.""" + return bool( + is_list_like(color) + and (len(color) == 3 or len(color) == 4) + and all(isinstance(x, (int, float)) for x in color) + ) + + +def _get_colors_from_color_type(color_type: str, num_colors: int) -> List[Color]: + """Get colors from user input color type.""" + if color_type == "default": + return _get_default_colors(num_colors) + elif color_type == "random": + return _get_random_colors(num_colors) + else: + raise ValueError("color_type must be either 'default' or 'random'") + + +def _get_default_colors(num_colors: int) -> List[Color]: + """Get `num_colors` of default colors from matplotlib rc params.""" + import matplotlib.pyplot as plt + + colors = [c["color"] for c in plt.rcParams["axes.prop_cycle"]] + return colors[0:num_colors] + + +def _get_random_colors(num_colors: int) -> List[Color]: + """Get `num_colors` of random colors.""" + return [_random_color(num) for num in range(num_colors)] + + +def _random_color(column: int) -> List[float]: + """Get a random color represented as a list of length 3""" + # GH17525 use common._random_state to avoid resetting the seed + rs = com.random_state(column) + return rs.rand(3).tolist() + + +def _is_single_string_color(color: Color) -> bool: + """Check if `color` is a single string color. - Examples of single colors: + Examples of single string colors: - 'r' - 'g' - 'red' - 'green' - 'C3' + - 'firebrick' Parameters ---------- - color : string - Color string. + color : Color + Color string or sequence of floats. Returns ------- bool - True if ``color`` looks like a valid color. + True if `color` looks like a valid color. False otherwise. """ conv = matplotlib.colors.ColorConverter() diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index cefd2ae7a9ddb..b0b8f1345e4d3 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -173,7 +173,26 @@ class TestDatetime64SeriesComparison: ) @pytest.mark.parametrize("reverse", [True, False]) @pytest.mark.parametrize("dtype", [None, object]) - def test_nat_comparisons(self, dtype, index_or_series, reverse, pair): + @pytest.mark.parametrize( + "op, expected", + [ + (operator.eq, Series([False, False, True])), + (operator.ne, Series([True, True, False])), + (operator.lt, Series([False, False, False])), + (operator.gt, Series([False, False, False])), + (operator.ge, Series([False, False, True])), + (operator.le, Series([False, False, True])), + ], + ) + def test_nat_comparisons( + self, + dtype, + index_or_series, + reverse, + pair, + op, + expected, + ): box = index_or_series l, r = pair if reverse: @@ -182,25 +201,10 @@ def test_nat_comparisons(self, dtype, index_or_series, reverse, pair): left = Series(l, dtype=dtype) right = box(r, dtype=dtype) - # Series, Index - expected = Series([False, False, True]) - tm.assert_series_equal(left == right, expected) + result = op(left, right) - expected = Series([True, True, False]) - tm.assert_series_equal(left != right, expected) - - expected = Series([False, False, False]) - tm.assert_series_equal(left < right, expected) - - expected = Series([False, False, False]) - tm.assert_series_equal(left > right, expected) - - expected = Series([False, False, True]) - tm.assert_series_equal(left >= right, expected) - - expected = Series([False, False, True]) - tm.assert_series_equal(left <= right, expected) + tm.assert_series_equal(result, expected) def test_comparison_invalid(self, tz_naive_fixture, box_with_array): # GH#4968 diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 47ce9cb4089f9..deafa22a6e8eb 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -8,34 +8,45 @@ class TestCategoricalDtypes: - def test_is_equal_dtype(self): + def test_is_dtype_equal_deprecated(self): + # GH#37545 + c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) + + with tm.assert_produces_warning(FutureWarning): + c1.is_dtype_equal(c1) + + def test_categories_match_up_to_permutation(self): # test dtype comparisons between cats c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) - assert c1.is_dtype_equal(c1) - assert c2.is_dtype_equal(c2) - assert c3.is_dtype_equal(c3) - assert c1.is_dtype_equal(c2) - assert not c1.is_dtype_equal(c3) - assert not c1.is_dtype_equal(Index(list("aabca"))) - assert not c1.is_dtype_equal(c1.astype(object)) - assert c1.is_dtype_equal(CategoricalIndex(c1)) - assert c1.is_dtype_equal(CategoricalIndex(c1, categories=list("cab"))) - assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True)) + assert c1._categories_match_up_to_permutation(c1) + assert c2._categories_match_up_to_permutation(c2) + assert c3._categories_match_up_to_permutation(c3) + assert c1._categories_match_up_to_permutation(c2) + assert not c1._categories_match_up_to_permutation(c3) + assert not c1._categories_match_up_to_permutation(Index(list("aabca"))) + assert not c1._categories_match_up_to_permutation(c1.astype(object)) + assert c1._categories_match_up_to_permutation(CategoricalIndex(c1)) + assert c1._categories_match_up_to_permutation( + CategoricalIndex(c1, categories=list("cab")) + ) + assert not c1._categories_match_up_to_permutation( + CategoricalIndex(c1, ordered=True) + ) # GH 16659 s1 = Series(c1) s2 = Series(c2) s3 = Series(c3) - assert c1.is_dtype_equal(s1) - assert c2.is_dtype_equal(s2) - assert c3.is_dtype_equal(s3) - assert c1.is_dtype_equal(s2) - assert not c1.is_dtype_equal(s3) - assert not c1.is_dtype_equal(s1.astype(object)) + assert c1._categories_match_up_to_permutation(s1) + assert c2._categories_match_up_to_permutation(s2) + assert c3._categories_match_up_to_permutation(s3) + assert c1._categories_match_up_to_permutation(s2) + assert not c1._categories_match_up_to_permutation(s3) + assert not c1._categories_match_up_to_permutation(s1.astype(object)) def test_set_dtype_same(self): c = Categorical(["a", "b", "c"]) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 21bea9356dcf0..364c290edc46c 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -60,7 +60,10 @@ def test_set_item_nan(self): ), (dict(), "Must specify a fill 'value' or 'method'."), (dict(method="bad"), "Invalid fill method. Expecting .* bad"), - (dict(value=Series([1, 2, 3, 4, "a"])), "fill value must be in categories"), + ( + dict(value=Series([1, 2, 3, 4, "a"])), + "Cannot setitem on a Categorical with a new category", + ), ], ) def test_fillna_raises(self, fillna_kwargs, msg): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index f621479e4f311..ec20c829f1544 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -191,10 +191,11 @@ def test_unbox_scalar(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = self.array_cls(data, freq="D") result = arr._unbox_scalar(arr[0]) - assert isinstance(result, int) + expected = arr._data.dtype.type + assert isinstance(result, expected) result = arr._unbox_scalar(pd.NaT) - assert isinstance(result, int) + assert isinstance(result, expected) msg = f"'value' should be a {self.dtype.__name__}." with pytest.raises(ValueError, match=msg): @@ -328,11 +329,41 @@ def test_iter_2d(self, arr1d): data2d = arr1d._data[:3, np.newaxis] arr2d = type(arr1d)._simple_new(data2d, dtype=arr1d.dtype) result = list(arr2d) + assert len(result) == 3 for x in result: assert isinstance(x, type(arr1d)) assert x.ndim == 1 assert x.dtype == arr1d.dtype + def test_repr_2d(self, arr1d): + data2d = arr1d._data[:3, np.newaxis] + arr2d = type(arr1d)._simple_new(data2d, dtype=arr1d.dtype) + + result = repr(arr2d) + + if isinstance(arr2d, TimedeltaArray): + expected = ( + f"<{type(arr2d).__name__}>\n" + "[\n" + f"['{arr1d[0]._repr_base()}'],\n" + f"['{arr1d[1]._repr_base()}'],\n" + f"['{arr1d[2]._repr_base()}']\n" + "]\n" + f"Shape: (3, 1), dtype: {arr1d.dtype}" + ) + else: + expected = ( + f"<{type(arr2d).__name__}>\n" + "[\n" + f"['{arr1d[0]}'],\n" + f"['{arr1d[1]}'],\n" + f"['{arr1d[2]}']\n" + "]\n" + f"Shape: (3, 1), dtype: {arr1d.dtype}" + ) + + assert result == expected + def test_setitem(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = self.array_cls(data, freq="D") diff --git a/pandas/tests/dtypes/cast/test_dict_compat.py b/pandas/tests/dtypes/cast/test_dict_compat.py new file mode 100644 index 0000000000000..13dc82d779f95 --- /dev/null +++ b/pandas/tests/dtypes/cast/test_dict_compat.py @@ -0,0 +1,14 @@ +import numpy as np + +from pandas.core.dtypes.cast import dict_compat + +from pandas import Timestamp + + +def test_dict_compat(): + data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} + data_unchanged = {1: 2, 3: 4, 5: 6} + expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} + assert dict_compat(data_datetime64) == expected + assert dict_compat(expected) == expected + assert dict_compat(data_unchanged) == data_unchanged diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index e973b1247941f..29a59cdefbd83 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -447,7 +447,7 @@ def test_repeat(self, data, repeats, as_series, use_numpy): @pytest.mark.parametrize( "repeats, kwargs, error, msg", [ - (2, dict(axis=1), ValueError, "'axis"), + (2, dict(axis=1), ValueError, "axis"), (-1, dict(), ValueError, "negative"), ([1, 2], dict(), ValueError, "shape"), (2, dict(foo="bar"), TypeError, "'foo'"), diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 03498b278f890..162035b53d68d 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -356,12 +356,17 @@ def test_apply_reduce_Series(self, float_frame): result = float_frame.apply(np.mean, axis=1) tm.assert_series_equal(result, expected) - def test_apply_reduce_rows_to_dict(self): - # GH 25196 - data = DataFrame([[1, 2], [3, 4]]) - expected = Series([{0: 1, 1: 3}, {0: 2, 1: 4}]) - result = data.apply(dict) - tm.assert_series_equal(result, expected) + def test_apply_reduce_to_dict(self): + # GH 25196 37544 + data = DataFrame([[1, 2], [3, 4]], columns=["c0", "c1"], index=["i0", "i1"]) + + result0 = data.apply(dict, axis=0) + expected0 = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns) + tm.assert_series_equal(result0, expected0) + + result1 = data.apply(dict, axis=1) + expected1 = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index) + tm.assert_series_equal(result1, expected1) def test_apply_differently_indexed(self): df = DataFrame(np.random.randn(20, 10)) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 58f0e5bc1ad39..9eaa0d0ae6876 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1327,32 +1327,6 @@ def test_getitem_list_duplicates(self): expected = df.iloc[:, 2:] tm.assert_frame_equal(result, expected) - def test_set_value_with_index_dtype_change(self): - df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC")) - - # this is actually ambiguous as the 2 is interpreted as a positional - # so column is not created - df = df_orig.copy() - df._set_value("C", 2, 1.0) - assert list(df.index) == list(df_orig.index) + ["C"] - # assert list(df.columns) == list(df_orig.columns) + [2] - - df = df_orig.copy() - df.loc["C", 2] = 1.0 - assert list(df.index) == list(df_orig.index) + ["C"] - # assert list(df.columns) == list(df_orig.columns) + [2] - - # create both new - df = df_orig.copy() - df._set_value("C", "D", 1.0) - assert list(df.index) == list(df_orig.index) + ["C"] - assert list(df.columns) == list(df_orig.columns) + ["D"] - - df = df_orig.copy() - df.loc["C", "D"] = 1.0 - assert list(df.index) == list(df_orig.index) + ["C"] - assert list(df.columns) == list(df_orig.columns) + ["D"] - # TODO: rename? remove? def test_single_element_ix_dont_upcast(self, float_frame): float_frame["E"] = 1 diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index 484e2d544197e..84def57f6b6e0 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -3,7 +3,7 @@ from pandas.core.dtypes.common import is_float_dtype -from pandas import isna +from pandas import DataFrame, isna class TestSetValue: @@ -38,3 +38,29 @@ def test_set_value_resize(self, float_frame): msg = "could not convert string to float: 'sam'" with pytest.raises(ValueError, match=msg): res._set_value("foobar", "baz", "sam") + + def test_set_value_with_index_dtype_change(self): + df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC")) + + # this is actually ambiguous as the 2 is interpreted as a positional + # so column is not created + df = df_orig.copy() + df._set_value("C", 2, 1.0) + assert list(df.index) == list(df_orig.index) + ["C"] + # assert list(df.columns) == list(df_orig.columns) + [2] + + df = df_orig.copy() + df.loc["C", 2] = 1.0 + assert list(df.index) == list(df_orig.index) + ["C"] + # assert list(df.columns) == list(df_orig.columns) + [2] + + # create both new + df = df_orig.copy() + df._set_value("C", "D", 1.0) + assert list(df.index) == list(df_orig.index) + ["C"] + assert list(df.columns) == list(df_orig.columns) + ["D"] + + df = df_orig.copy() + df.loc["C", "D"] = 1.0 + assert list(df.index) == list(df_orig.index) + ["C"] + assert list(df.columns) == list(df_orig.columns) + ["D"] diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 55465dffd2027..e1ce10970f07b 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -185,6 +185,53 @@ def test_setitem_extension_types(self, obj, dtype): tm.assert_frame_equal(df, expected) + def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self): + # GH#7492 + data_ns = np.array([1, "nat"], dtype="datetime64[ns]") + result = Series(data_ns).to_frame() + result["new"] = data_ns + expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]") + tm.assert_frame_equal(result, expected) + + # OutOfBoundsDatetime error shouldn't occur + data_s = np.array([1, "nat"], dtype="datetime64[s]") + result["new"] = data_s + expected = DataFrame({0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"]) + def test_frame_setitem_datetime64_col_other_units(self, unit): + # Check that non-nano dt64 values get cast to dt64 on setitem + # into a not-yet-existing column + n = 100 + + dtype = np.dtype(f"M8[{unit}]") + vals = np.arange(n, dtype=np.int64).view(dtype) + ex_vals = vals.astype("datetime64[ns]") + + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) + df[unit] = vals + + assert df[unit].dtype == np.dtype("M8[ns]") + assert (df[unit].values == ex_vals).all() + + @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"]) + def test_frame_setitem_existing_datetime64_col_other_units(self, unit): + # Check that non-nano dt64 values get cast to dt64 on setitem + # into an already-existing dt64 column + n = 100 + + dtype = np.dtype(f"M8[{unit}]") + vals = np.arange(n, dtype=np.int64).view(dtype) + ex_vals = vals.astype("datetime64[ns]") + + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) + df["dates"] = np.arange(n, dtype=np.int64).view("M8[ns]") + + # We overwrite existing dt64 column with new, non-nano dt64 vals + df["dates"] = vals + assert (df["dates"].values == ex_vals).all() + def test_setitem_dt64tz(self, timezone_frame): df = timezone_frame diff --git a/pandas/tests/frame/test_add_prefix_suffix.py b/pandas/tests/frame/methods/test_add_prefix_suffix.py similarity index 100% rename from pandas/tests/frame/test_add_prefix_suffix.py rename to pandas/tests/frame/methods/test_add_prefix_suffix.py diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index cdcd922949bcf..368ce88abe165 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -74,3 +74,14 @@ def test_asfreq_fillvalue(self): expected_series = ts.asfreq(freq="1S").fillna(9.0) actual_series = ts.asfreq(freq="1S", fill_value=9.0) tm.assert_series_equal(expected_series, actual_series) + + def test_asfreq_with_date_object_index(self, frame_or_series): + rng = date_range("1/1/2000", periods=20) + ts = frame_or_series(np.random.randn(20), index=rng) + + ts2 = ts.copy() + ts2.index = [x.date() for x in ts2.index] + + result = ts2.asfreq("4H", method="ffill") + expected = ts.asfreq("4H", method="ffill") + tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 70b42976c95a7..6931dd0ea2d4c 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -96,12 +96,16 @@ def test_missing(self, date_range_frame): result = df.asof("1989-12-31") assert isinstance(result.name, Period) + def test_asof_all_nans(self, frame_or_series): + # GH 15713 + # DataFrame/Series is all nans + result = frame_or_series([np.nan]).asof([0]) + expected = frame_or_series([np.nan]) + tm.assert_equal(result, expected) + def test_all_nans(self, date_range_frame): # GH 15713 # DataFrame is all nans - result = DataFrame([np.nan]).asof([0]) - expected = DataFrame([np.nan]) - tm.assert_frame_equal(result, expected) # testing non-default indexes, multiple inputs N = 150 diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index d3f256259b15f..f05c90f37ea8a 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -587,3 +587,27 @@ def test_astype_ignores_errors_for_extension_dtypes(self, df, errors): msg = "(Cannot cast)|(could not convert)" with pytest.raises((ValueError, TypeError), match=msg): df.astype(float, errors=errors) + + def test_astype_tz_conversion(self): + # GH 35973 + val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + df = DataFrame(val) + result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"}) + + expected = df + expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"]) + def test_astype_tz_object_conversion(self, tz): + # GH 35973 + val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + expected = DataFrame(val) + + # convert expected to object dtype from other tz str (independently tested) + result = expected.astype({"tz": f"datetime64[ns, {tz}]"}) + result = result.astype({"tz": "object"}) + + # do real test: object dtype to a specified tz, different from construction tz. + result = result.astype({"tz": "datetime64[ns, Europe/London]"}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py index ac98d632c5dcd..7ac3868e8ddf4 100644 --- a/pandas/tests/frame/methods/test_at_time.py +++ b/pandas/tests/frame/methods/test_at_time.py @@ -4,14 +4,32 @@ import pytest import pytz +from pandas._libs.tslibs import timezones + from pandas import DataFrame, date_range import pandas._testing as tm class TestAtTime: - def test_at_time(self): + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_localized_at_time(self, tzstr, frame_or_series): + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range("4/16/2012", "5/1/2012", freq="H") + ts = frame_or_series(np.random.randn(len(rng)), index=rng) + + ts_local = ts.tz_localize(tzstr) + + result = ts_local.at_time(time(10, 0)) + expected = ts.at_time(time(10, 0)).tz_localize(tzstr) + tm.assert_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + def test_at_time(self, frame_or_series): rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + if frame_or_series is not DataFrame: + ts = ts[0] rs = ts.at_time(rng[1]) assert (rs.index.hour == rng[1].hour).all() assert (rs.index.minute == rng[1].minute).all() @@ -19,23 +37,24 @@ def test_at_time(self): result = ts.at_time("9:30") expected = ts.at_time(time(9, 30)) - tm.assert_frame_equal(result, expected) - - result = ts.loc[time(9, 30)] - expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)] - - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) + def test_at_time_midnight(self, frame_or_series): # midnight, everything rng = date_range("1/1/2000", "1/31/2000") ts = DataFrame(np.random.randn(len(rng), 3), index=rng) + if frame_or_series is not DataFrame: + ts = ts[0] result = ts.at_time(time(0, 0)) - tm.assert_frame_equal(result, ts) + tm.assert_equal(result, ts) + def test_at_time_nonexistent(self, frame_or_series): # time doesn't exist rng = date_range("1/1/2012", freq="23Min", periods=384) - ts = DataFrame(np.random.randn(len(rng), 2), rng) + ts = DataFrame(np.random.randn(len(rng)), rng) + if frame_or_series is not DataFrame: + ts = ts[0] rs = ts.at_time("16:00") assert len(rs) == 0 @@ -62,12 +81,14 @@ def test_at_time_tz(self): expected = df.iloc[1:2] tm.assert_frame_equal(result, expected) - def test_at_time_raises(self): + def test_at_time_raises(self, frame_or_series): # GH#20725 - df = DataFrame([[1, 2, 3], [4, 5, 6]]) + obj = DataFrame([[1, 2, 3], [4, 5, 6]]) + if frame_or_series is not DataFrame: + obj = obj[0] msg = "Index must be DatetimeIndex" with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex - df.at_time("00:00") + obj.at_time("00:00") @pytest.mark.parametrize("axis", ["index", "columns", 0, 1]) def test_at_time_axis(self, axis): diff --git a/pandas/tests/frame/methods/test_between_time.py b/pandas/tests/frame/methods/test_between_time.py index 19e802d0fa663..73722f36a0b86 100644 --- a/pandas/tests/frame/methods/test_between_time.py +++ b/pandas/tests/frame/methods/test_between_time.py @@ -1,16 +1,73 @@ -from datetime import time +from datetime import datetime, time import numpy as np import pytest -from pandas import DataFrame, date_range +from pandas._libs.tslibs import timezones +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series, date_range import pandas._testing as tm class TestBetweenTime: - def test_between_time(self, close_open_fixture): + @td.skip_if_has_locale + def test_between_time_formats(self, frame_or_series): + # GH#11818 rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + if frame_or_series is Series: + ts = ts[0] + + strings = [ + ("2:00", "2:30"), + ("0200", "0230"), + ("2:00am", "2:30am"), + ("0200am", "0230am"), + ("2:00:00", "2:30:00"), + ("020000", "023000"), + ("2:00:00am", "2:30:00am"), + ("020000am", "023000am"), + ] + expected_length = 28 + + for time_string in strings: + assert len(ts.between_time(*time_string)) == expected_length + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_localized_between_time(self, tzstr, frame_or_series): + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range("4/16/2012", "5/1/2012", freq="H") + ts = Series(np.random.randn(len(rng)), index=rng) + if frame_or_series is DataFrame: + ts = ts.to_frame() + + ts_local = ts.tz_localize(tzstr) + + t1, t2 = time(10, 0), time(11, 0) + result = ts_local.between_time(t1, t2) + expected = ts.between_time(t1, t2).tz_localize(tzstr) + tm.assert_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + def test_between_time_types(self, frame_or_series): + # GH11818 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + obj = DataFrame({"A": 0}, index=rng) + if frame_or_series is Series: + obj = obj["A"] + + msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time" + with pytest.raises(ValueError, match=msg): + obj.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + def test_between_time(self, close_open_fixture, frame_or_series): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + if frame_or_series is not DataFrame: + ts = ts[0] + stime = time(0, 0) etime = time(1, 0) inc_start, inc_end = close_open_fixture @@ -37,11 +94,13 @@ def test_between_time(self, close_open_fixture): result = ts.between_time("00:00", "01:00") expected = ts.between_time(stime, etime) - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) # across midnight rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + if frame_or_series is not DataFrame: + ts = ts[0] stime = time(22, 0) etime = time(9, 0) @@ -65,14 +124,33 @@ def test_between_time(self, close_open_fixture): else: assert (t < etime) or (t >= stime) - def test_between_time_raises(self): + def test_between_time_raises(self, frame_or_series): # GH#20725 - df = DataFrame([[1, 2, 3], [4, 5, 6]]) + obj = DataFrame([[1, 2, 3], [4, 5, 6]]) + if frame_or_series is not DataFrame: + obj = obj[0] + msg = "Index must be DatetimeIndex" with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex - df.between_time(start_time="00:00", end_time="12:00") + obj.between_time(start_time="00:00", end_time="12:00") + + def test_between_time_axis(self, frame_or_series): + # GH#8839 + rng = date_range("1/1/2000", periods=100, freq="10min") + ts = Series(np.random.randn(len(rng)), index=rng) + if frame_or_series is DataFrame: + ts = ts.to_frame() + + stime, etime = ("08:00:00", "09:00:00") + expected_length = 7 + + assert len(ts.between_time(stime, etime)) == expected_length + assert len(ts.between_time(stime, etime, axis=0)) == expected_length + msg = f"No axis named {ts.ndim} for object type {type(ts).__name__}" + with pytest.raises(ValueError, match=msg): + ts.between_time(stime, etime, axis=ts.ndim) - def test_between_time_axis(self, axis): + def test_between_time_axis_aliases(self, axis): # GH#8839 rng = date_range("1/1/2000", periods=100, freq="10min") ts = DataFrame(np.random.randn(len(rng), len(rng))) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 4850c6a50f8a8..08c4293323500 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, Series +from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm @@ -365,3 +365,32 @@ def test_combine_first_string_dtype_only_na(self): {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" ).set_index(["a", "b"]) tm.assert_frame_equal(result, expected) + + +def test_combine_first_with_nan_multiindex(): + # gh-36562 + + mi1 = MultiIndex.from_arrays( + [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] + ) + df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) + mi2 = MultiIndex.from_arrays( + [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] + ) + s = Series([1, 2, 3, 4, 5, 6], index=mi2) + res = df.combine_first(DataFrame({"d": s})) + mi_expected = MultiIndex.from_arrays( + [ + ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], + [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6], + ], + names=["a", "b"], + ) + expected = DataFrame( + { + "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1], + "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan], + }, + index=mi_expected, + ) + tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/frame/methods/test_droplevel.py b/pandas/tests/frame/methods/test_droplevel.py index 517905cf23259..ce98704b03106 100644 --- a/pandas/tests/frame/methods/test_droplevel.py +++ b/pandas/tests/frame/methods/test_droplevel.py @@ -1,23 +1,32 @@ +import pytest + from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm class TestDropLevel: - def test_droplevel(self): + def test_droplevel(self, frame_or_series): # GH#20342 - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) - df = df.set_index([0, 1]).rename_axis(["a", "b"]) - df.columns = MultiIndex.from_tuples( + cols = MultiIndex.from_tuples( [("c", "e"), ("d", "f")], names=["level_1", "level_2"] ) + mi = MultiIndex.from_tuples([(1, 2), (5, 6), (9, 10)], names=["a", "b"]) + df = DataFrame([[3, 4], [7, 8], [11, 12]], index=mi, columns=cols) + if frame_or_series is not DataFrame: + df = df.iloc[:, 0] # test that dropping of a level in index works expected = df.reset_index("a", drop=True) result = df.droplevel("a", axis="index") - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) - # test that dropping of a level in columns works - expected = df.copy() - expected.columns = Index(["c", "d"], name="level_1") - result = df.droplevel("level_2", axis="columns") - tm.assert_frame_equal(result, expected) + if frame_or_series is DataFrame: + # test that dropping of a level in columns works + expected = df.copy() + expected.columns = Index(["c", "d"], name="level_1") + result = df.droplevel("level_2", axis="columns") + tm.assert_equal(result, expected) + else: + # test that droplevel raises ValueError on axis != 0 + with pytest.raises(ValueError, match="No axis named columns"): + df.droplevel(1, axis="columns") diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index c024390297fec..de2509ed91be2 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -1,4 +1,6 @@ -from pandas import DataFrame +import numpy as np + +from pandas import DataFrame, date_range import pandas._testing as tm @@ -21,3 +23,56 @@ def test_equals_different_blocks(self): tm.assert_frame_equal(df0, df1) assert df0.equals(df1) assert df1.equals(df0) + + def test_equals(self): + # Add object dtype column with nans + index = np.random.random(10) + df1 = DataFrame(np.random.random(10), index=index, columns=["floats"]) + df1["text"] = "the sky is so blue. we could use more chocolate.".split() + df1["start"] = date_range("2000-1-1", periods=10, freq="T") + df1["end"] = date_range("2000-1-1", periods=10, freq="D") + df1["diff"] = df1["end"] - df1["start"] + df1["bool"] = np.arange(10) % 3 == 0 + df1.loc[::2] = np.nan + df2 = df1.copy() + assert df1["text"].equals(df2["text"]) + assert df1["start"].equals(df2["start"]) + assert df1["end"].equals(df2["end"]) + assert df1["diff"].equals(df2["diff"]) + assert df1["bool"].equals(df2["bool"]) + assert df1.equals(df2) + assert not df1.equals(object) + + # different dtype + different = df1.copy() + different["floats"] = different["floats"].astype("float32") + assert not df1.equals(different) + + # different index + different_index = -index + different = df2.set_index(different_index) + assert not df1.equals(different) + + # different columns + different = df2.copy() + different.columns = df2.columns[::-1] + assert not df1.equals(different) + + # DatetimeIndex + index = date_range("2000-1-1", periods=10, freq="T") + df1 = df1.set_index(index) + df2 = df1.copy() + assert df1.equals(df2) + + # MultiIndex + df3 = df1.set_index(["text"], append=True) + df2 = df1.set_index(["text"], append=True) + assert df3.equals(df2) + + df2 = df1.set_index(["floats"], append=True) + assert not df3.equals(df2) + + # NaN in index + df3 = df1.set_index(["floats"], append=True) + df2 = df1.set_index(["floats"], append=True) + assert df3.equals(df2) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 9fa1aa65379c5..bbb57da39705b 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -170,7 +170,7 @@ def test_na_actions_categorical(self): res = df.fillna(value={"cats": 3, "vals": "b"}) tm.assert_frame_equal(res, df_exp_fill) - msg = "'fill_value=4' is not present in this Categorical's categories" + msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): df.fillna(value={"cats": 4, "vals": "c"}) diff --git a/pandas/tests/frame/methods/test_first_and_last.py b/pandas/tests/frame/methods/test_first_and_last.py index 2b3756969acca..d21e1eee54e16 100644 --- a/pandas/tests/frame/methods/test_first_and_last.py +++ b/pandas/tests/frame/methods/test_first_and_last.py @@ -8,56 +8,64 @@ class TestFirst: - def test_first_subset(self): + def test_first_subset(self, frame_or_series): ts = tm.makeTimeDataFrame(freq="12h") + if frame_or_series is not DataFrame: + ts = ts["A"] result = ts.first("10d") assert len(result) == 20 ts = tm.makeTimeDataFrame(freq="D") + if frame_or_series is not DataFrame: + ts = ts["A"] result = ts.first("10d") assert len(result) == 10 result = ts.first("3M") expected = ts[:"3/31/2000"] - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) result = ts.first("21D") expected = ts[:21] - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) result = ts[:0].first("3M") - tm.assert_frame_equal(result, ts[:0]) + tm.assert_equal(result, ts[:0]) - def test_first_raises(self): + def test_first_last_raises(self, frame_or_series): # GH#20725 - df = DataFrame([[1, 2, 3], [4, 5, 6]]) + obj = DataFrame([[1, 2, 3], [4, 5, 6]]) + if frame_or_series is not DataFrame: + obj = obj[0] + msg = "'first' only supports a DatetimeIndex index" with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex - df.first("1D") + obj.first("1D") + + msg = "'last' only supports a DatetimeIndex index" + with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex + obj.last("1D") - def test_last_subset(self): + def test_last_subset(self, frame_or_series): ts = tm.makeTimeDataFrame(freq="12h") + if frame_or_series is not DataFrame: + ts = ts["A"] result = ts.last("10d") assert len(result) == 20 ts = tm.makeTimeDataFrame(nper=30, freq="D") + if frame_or_series is not DataFrame: + ts = ts["A"] result = ts.last("10d") assert len(result) == 10 result = ts.last("21D") expected = ts["2000-01-10":] - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) result = ts.last("21D") expected = ts[-21:] - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) result = ts[:0].last("3M") - tm.assert_frame_equal(result, ts[:0]) - - def test_last_raises(self): - # GH20725 - df = DataFrame([[1, 2, 3], [4, 5, 6]]) - msg = "'last' only supports a DatetimeIndex index" - with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex - df.last("1D") + tm.assert_equal(result, ts[:0]) diff --git a/pandas/tests/frame/methods/test_head_tail.py b/pandas/tests/frame/methods/test_head_tail.py index 93763bc12ce0d..99cb7840c3eb6 100644 --- a/pandas/tests/frame/methods/test_head_tail.py +++ b/pandas/tests/frame/methods/test_head_tail.py @@ -4,6 +4,30 @@ import pandas._testing as tm +def test_head_tail_generic(index, frame_or_series): + # GH#5370 + + ndim = 2 if frame_or_series is DataFrame else 1 + shape = (len(index),) * ndim + vals = np.random.randn(*shape) + obj = frame_or_series(vals, index=index) + + tm.assert_equal(obj.head(), obj.iloc[:5]) + tm.assert_equal(obj.tail(), obj.iloc[-5:]) + + # 0-len + tm.assert_equal(obj.head(0), obj.iloc[0:0]) + tm.assert_equal(obj.tail(0), obj.iloc[0:0]) + + # bounded + tm.assert_equal(obj.head(len(obj) + 1), obj) + tm.assert_equal(obj.tail(len(obj) + 1), obj) + + # neg index + tm.assert_equal(obj.head(-3), obj.head(len(index) - 3)) + tm.assert_equal(obj.tail(-3), obj.tail(len(index) - 3)) + + def test_head_tail(float_frame): tm.assert_frame_equal(float_frame.head(), float_frame[:5]) tm.assert_frame_equal(float_frame.tail(), float_frame[-5:]) @@ -24,6 +48,9 @@ def test_head_tail(float_frame): tm.assert_frame_equal(df.tail(0), df[0:0]) tm.assert_frame_equal(df.head(-1), df.iloc[:-1]) tm.assert_frame_equal(df.tail(-1), df.iloc[1:]) + + +def test_head_tail_empty(): # test empty dataframe empty_df = DataFrame() tm.assert_frame_equal(empty_df.tail(), empty_df) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 92c9f7564a670..56fd633f5f22b 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -71,6 +71,15 @@ def test_reset_index_tz(self, tz_aware_fixture): expected["idx"] = expected["idx"].apply(lambda d: Timestamp(d, tz=tz)) tm.assert_frame_equal(df.reset_index(), expected) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_frame_reset_index_tzaware_index(self, tz): + dr = date_range("2012-06-02", periods=10, tz=tz) + df = DataFrame(np.random.randn(len(dr)), dr) + roundtripped = df.reset_index().set_index("index") + xp = df.index.tz + rs = roundtripped.index.tz + assert xp == rs + def test_reset_index_with_intervals(self): idx = IntervalIndex.from_breaks(np.arange(11), name="x") original = DataFrame({"x": idx, "y": np.arange(10)})[["x", "y"]] diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 5bf1ce508dfc4..3103f6e1ba0b1 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1034,11 +1034,12 @@ def test_to_csv_compression(self, df, encoding, compression): tm.assert_frame_equal(df, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = get_handle( + handles = get_handle( filename, "w", compression=compression, encoding=encoding ) - with f: - df.to_csv(f, encoding=encoding) + df.to_csv(handles.handle, encoding=encoding) + assert not handles.handle.closed + handles.close() result = pd.read_csv( filename, compression=compression, diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index f1656b46cf356..f8feef7a95eab 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -257,17 +257,30 @@ def test_to_dict_wide(self): assert result == expected def test_to_dict_orient_dtype(self): - # GH#22620 - # Input Data - input_data = {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["X", "Y", "Z"]} - df = DataFrame(input_data) - # Expected Dtypes - expected = {"a": int, "b": float, "c": str} - # Extracting dtypes out of to_dict operation - for df_dict in df.to_dict("records"): - result = { - "a": type(df_dict["a"]), - "b": type(df_dict["b"]), - "c": type(df_dict["c"]), + # GH22620 & GH21256 + + df = DataFrame( + { + "bool": [True, True, False], + "datetime": [ + datetime(2018, 1, 1), + datetime(2019, 2, 2), + datetime(2020, 3, 3), + ], + "float": [1.0, 2.0, 3.0], + "int": [1, 2, 3], + "str": ["X", "Y", "Z"], } + ) + + expected = { + "int": int, + "float": float, + "str": str, + "datetime": Timestamp, + "bool": bool, + } + + for df_dict in df.to_dict("records"): + result = {col: type(df_dict[col]) for col in list(df.columns)} assert result == expected diff --git a/pandas/tests/frame/methods/test_to_period.py b/pandas/tests/frame/methods/test_to_period.py index 051461b6c554d..e3f3fe9f697a9 100644 --- a/pandas/tests/frame/methods/test_to_period.py +++ b/pandas/tests/frame/methods/test_to_period.py @@ -1,36 +1,87 @@ import numpy as np import pytest -from pandas import DataFrame, date_range, period_range +from pandas import ( + DataFrame, + DatetimeIndex, + PeriodIndex, + Series, + date_range, + period_range, +) import pandas._testing as tm class TestToPeriod: - def test_frame_to_period(self): + def test_to_period(self, frame_or_series): K = 5 - dr = date_range("1/1/2000", "1/1/2001") - pr = period_range("1/1/2000", "1/1/2001") - df = DataFrame(np.random.randn(len(dr), K), index=dr) - df["mix"] = "a" + dr = date_range("1/1/2000", "1/1/2001", freq="D") + obj = DataFrame( + np.random.randn(len(dr), K), index=dr, columns=["A", "B", "C", "D", "E"] + ) + obj["mix"] = "a" + if frame_or_series is Series: + obj = obj["A"] - pts = df.to_period() - exp = df.copy() - exp.index = pr - tm.assert_frame_equal(pts, exp) + pts = obj.to_period() + exp = obj.copy() + exp.index = period_range("1/1/2000", "1/1/2001") + tm.assert_equal(pts, exp) - pts = df.to_period("M") - tm.assert_index_equal(pts.index, exp.index.asfreq("M")) + pts = obj.to_period("M") + exp.index = exp.index.asfreq("M") + tm.assert_equal(pts, exp) + + def test_to_period_without_freq(self, frame_or_series): + # GH#7606 without freq + idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"]) + exp_idx = PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D" + ) + + obj = DataFrame(np.random.randn(4, 4), index=idx, columns=idx) + if frame_or_series is Series: + obj = obj[idx[0]] + expected = obj.copy() + expected.index = exp_idx + tm.assert_equal(obj.to_period(), expected) + + if frame_or_series is DataFrame: + expected = obj.copy() + expected.columns = exp_idx + tm.assert_frame_equal(obj.to_period(axis=1), expected) + + def test_to_period_columns(self): + dr = date_range("1/1/2000", "1/1/2001") + df = DataFrame(np.random.randn(len(dr), 5), index=dr) + df["mix"] = "a" df = df.T pts = df.to_period(axis=1) exp = df.copy() - exp.columns = pr + exp.columns = period_range("1/1/2000", "1/1/2001") tm.assert_frame_equal(pts, exp) pts = df.to_period("M", axis=1) tm.assert_index_equal(pts.columns, exp.columns.asfreq("M")) + def test_to_period_invalid_axis(self): + dr = date_range("1/1/2000", "1/1/2001") + df = DataFrame(np.random.randn(len(dr), 5), index=dr) + df["mix"] = "a" + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.to_period(axis=2) + + def test_to_period_raises(self, index, frame_or_series): + # https://github.com/pandas-dev/pandas/issues/33327 + obj = Series(index=index, dtype=object) + if frame_or_series is DataFrame: + obj = obj.to_frame() + + if not isinstance(index, DatetimeIndex): + msg = f"unsupported Type {type(index).__name__}" + with pytest.raises(TypeError, match=msg): + obj.to_period() diff --git a/pandas/tests/frame/methods/test_to_timestamp.py b/pandas/tests/frame/methods/test_to_timestamp.py index ae7d2827e05a6..e23d12b691b4a 100644 --- a/pandas/tests/frame/methods/test_to_timestamp.py +++ b/pandas/tests/frame/methods/test_to_timestamp.py @@ -6,6 +6,8 @@ from pandas import ( DataFrame, DatetimeIndex, + PeriodIndex, + Series, Timedelta, date_range, period_range, @@ -14,48 +16,70 @@ import pandas._testing as tm +def _get_with_delta(delta, freq="A-DEC"): + return date_range( + to_datetime("1/1/2001") + delta, + to_datetime("12/31/2009") + delta, + freq=freq, + ) + + class TestToTimestamp: - def test_frame_to_time_stamp(self): + def test_to_timestamp(self, frame_or_series): K = 5 index = period_range(freq="A", start="1/1/2001", end="12/1/2009") - df = DataFrame(np.random.randn(len(index), K), index=index) - df["mix"] = "a" + obj = DataFrame( + np.random.randn(len(index), K), + index=index, + columns=["A", "B", "C", "D", "E"], + ) + obj["mix"] = "a" + if frame_or_series is Series: + obj = obj["A"] exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") - result = df.to_timestamp("D", "end") + result = obj.to_timestamp("D", "end") tm.assert_index_equal(result.index, exp_index) - tm.assert_numpy_array_equal(result.values, df.values) + tm.assert_numpy_array_equal(result.values, obj.values) + if frame_or_series is Series: + assert result.name == "A" exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") - result = df.to_timestamp("D", "start") + result = obj.to_timestamp("D", "start") tm.assert_index_equal(result.index, exp_index) - def _get_with_delta(delta, freq="A-DEC"): - return date_range( - to_datetime("1/1/2001") + delta, - to_datetime("12/31/2009") + delta, - freq=freq, - ) + result = obj.to_timestamp(how="start") + tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23) - result = df.to_timestamp("H", "end") + result = obj.to_timestamp("H", "end") exp_index = _get_with_delta(delta) exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23, minutes=59) - result = df.to_timestamp("T", "end") + result = obj.to_timestamp("T", "end") exp_index = _get_with_delta(delta) exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - result = df.to_timestamp("S", "end") + result = obj.to_timestamp("S", "end") delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) + def test_to_timestamp_columns(self): + K = 5 + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + df = DataFrame( + np.random.randn(len(index), K), + index=index, + columns=["A", "B", "C", "D", "E"], + ) + df["mix"] = "a" + # columns df = df.T @@ -87,10 +111,6 @@ def _get_with_delta(delta, freq="A-DEC"): exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) - # invalid axis - with pytest.raises(ValueError, match="axis"): - df.to_timestamp(axis=2) - result1 = df.to_timestamp("5t", axis=1) result2 = df.to_timestamp("t", axis=1) expected = date_range("2001-01-01", "2009-01-01", freq="AS") @@ -101,3 +121,34 @@ def _get_with_delta(delta, freq="A-DEC"): # PeriodIndex.to_timestamp always use 'infer' assert result1.columns.freqstr == "AS-JAN" assert result2.columns.freqstr == "AS-JAN" + + def to_timestamp_invalid_axis(self): + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + obj = DataFrame(np.random.randn(len(index), 5), index=index) + + # invalid axis + with pytest.raises(ValueError, match="axis"): + obj.to_timestamp(axis=2) + + def test_to_timestamp_hourly(self, frame_or_series): + + index = period_range(freq="H", start="1/1/2001", end="1/2/2001") + obj = Series(1, index=index, name="foo") + if frame_or_series is not Series: + obj = obj.to_frame() + + exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="H") + result = obj.to_timestamp(how="end") + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + if frame_or_series is Series: + assert result.name == "foo" + + def test_to_timestamp_raises(self, index, frame_or_series): + # GH#33327 + obj = frame_or_series(index=index, dtype=object) + + if not isinstance(index, PeriodIndex): + msg = f"unsupported Type {type(index).__name__}" + with pytest.raises(TypeError, match=msg): + obj.to_timestamp() diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index a5fe5f3a6d5e4..8635168f1eb03 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -1,45 +1,55 @@ import numpy as np +import pytest -import pandas as pd +from pandas import DataFrame, date_range import pandas._testing as tm class TestTranspose: def test_transpose_tzaware_1col_single_tz(self): # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") - df = pd.DataFrame(dti) + df = DataFrame(dti) assert (df.dtypes == dti.dtype).all() res = df.T assert (res.dtypes == dti.dtype).all() def test_transpose_tzaware_2col_single_tz(self): # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") - df3 = pd.DataFrame({"A": dti, "B": dti}) + df3 = DataFrame({"A": dti, "B": dti}) assert (df3.dtypes == dti.dtype).all() res3 = df3.T assert (res3.dtypes == dti.dtype).all() def test_transpose_tzaware_2col_mixed_tz(self): # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") dti2 = dti.tz_convert("US/Pacific") - df4 = pd.DataFrame({"A": dti, "B": dti2}) + df4 = DataFrame({"A": dti, "B": dti2}) assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() assert (df4.T.dtypes == object).all() tm.assert_frame_equal(df4.T.T, df4) + @pytest.mark.parametrize("tz", [None, "America/New_York"]) + def test_transpose_preserves_dtindex_equality_with_dst(self, tz): + # GH#19970 + idx = date_range("20161101", "20161130", freq="4H", tz=tz) + df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) + result = df.T == df.T + expected = DataFrame(True, index=list("ab"), columns=idx) + tm.assert_frame_equal(result, expected) + def test_transpose_object_to_tzaware_mixed_tz(self): # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") dti2 = dti.tz_convert("US/Pacific") # mixed all-tzaware dtypes - df2 = pd.DataFrame([dti, dti2]) + df2 = DataFrame([dti, dti2]) assert (df2.dtypes == object).all() res2 = df2.T assert (res2.dtypes == [dti.dtype, dti2.dtype]).all() @@ -47,7 +57,7 @@ def test_transpose_object_to_tzaware_mixed_tz(self): def test_transpose_uint64(self, uint64_frame): result = uint64_frame.T - expected = pd.DataFrame(uint64_frame.values.T) + expected = DataFrame(uint64_frame.values.T) expected.index = ["A", "B"] tm.assert_frame_equal(result, expected) @@ -63,7 +73,7 @@ def test_transpose_float(self, float_frame): # mixed type index, data = tm.getMixedTypeDict() - mixed = pd.DataFrame(data, index=index) + mixed = DataFrame(data, index=index) mixed_T = mixed.T for col, s in mixed_T.items(): diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py index 674f482c478a0..c6d6637edc88c 100644 --- a/pandas/tests/frame/methods/test_truncate.py +++ b/pandas/tests/frame/methods/test_truncate.py @@ -2,12 +2,15 @@ import pytest import pandas as pd +from pandas import DataFrame, Series, date_range import pandas._testing as tm class TestDataFrameTruncate: - def test_truncate(self, datetime_frame): + def test_truncate(self, datetime_frame, frame_or_series): ts = datetime_frame[::3] + if frame_or_series is Series: + ts = ts.iloc[:, 0] start, end = datetime_frame.index[3], datetime_frame.index[6] @@ -16,34 +19,41 @@ def test_truncate(self, datetime_frame): # neither specified truncated = ts.truncate() - tm.assert_frame_equal(truncated, ts) + tm.assert_equal(truncated, ts) # both specified expected = ts[1:3] truncated = ts.truncate(start, end) - tm.assert_frame_equal(truncated, expected) + tm.assert_equal(truncated, expected) truncated = ts.truncate(start_missing, end_missing) - tm.assert_frame_equal(truncated, expected) + tm.assert_equal(truncated, expected) # start specified expected = ts[1:] truncated = ts.truncate(before=start) - tm.assert_frame_equal(truncated, expected) + tm.assert_equal(truncated, expected) truncated = ts.truncate(before=start_missing) - tm.assert_frame_equal(truncated, expected) + tm.assert_equal(truncated, expected) # end specified expected = ts[:3] truncated = ts.truncate(after=end) - tm.assert_frame_equal(truncated, expected) + tm.assert_equal(truncated, expected) truncated = ts.truncate(after=end_missing) - tm.assert_frame_equal(truncated, expected) + tm.assert_equal(truncated, expected) + + # corner case, empty series/frame returned + truncated = ts.truncate(after=ts.index[0] - ts.index.freq) + assert len(truncated) == 0 + + truncated = ts.truncate(before=ts.index[-1] + ts.index.freq) + assert len(truncated) == 0 msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00" with pytest.raises(ValueError, match=msg): @@ -57,25 +67,35 @@ def test_truncate_copy(self, datetime_frame): truncated.values[:] = 5.0 assert not (datetime_frame.values[5:11] == 5).any() - def test_truncate_nonsortedindex(self): + def test_truncate_nonsortedindex(self, frame_or_series): # GH#17935 - df = pd.DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0]) + obj = DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0]) + if frame_or_series is Series: + obj = obj["A"] + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): - df.truncate(before=3, after=9) + obj.truncate(before=3, after=9) + + def test_sort_values_nonsortedindex(self): + # TODO: belongs elsewhere? - rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") - ts = pd.DataFrame( + rng = date_range("2011-01-01", "2012-01-01", freq="W") + ts = DataFrame( {"A": np.random.randn(len(rng)), "B": np.random.randn(len(rng))}, index=rng ) + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): ts.sort_values("A", ascending=False).truncate( before="2011-11", after="2011-12" ) - df = pd.DataFrame( + def test_truncate_nonsortedindex_axis1(self): + # GH#17935 + + df = DataFrame( { 3: np.random.randn(5), 20: np.random.randn(5), @@ -93,27 +113,34 @@ def test_truncate_nonsortedindex(self): [(1, 2, [2, 1]), (None, 2, [2, 1, 0]), (1, None, [3, 2, 1])], ) @pytest.mark.parametrize("klass", [pd.Int64Index, pd.DatetimeIndex]) - def test_truncate_decreasing_index(self, before, after, indices, klass): + def test_truncate_decreasing_index( + self, before, after, indices, klass, frame_or_series + ): # https://github.com/pandas-dev/pandas/issues/33756 idx = klass([3, 2, 1, 0]) if klass is pd.DatetimeIndex: before = pd.Timestamp(before) if before is not None else None after = pd.Timestamp(after) if after is not None else None indices = [pd.Timestamp(i) for i in indices] - values = pd.DataFrame(range(len(idx)), index=idx) + values = frame_or_series(range(len(idx)), index=idx) result = values.truncate(before=before, after=after) expected = values.loc[indices] - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) - def test_truncate_multiindex(self): + def test_truncate_multiindex(self, frame_or_series): # GH 34564 mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"]) - s1 = pd.DataFrame(range(mi.shape[0]), index=mi, columns=["col"]) + s1 = DataFrame(range(mi.shape[0]), index=mi, columns=["col"]) + if frame_or_series is Series: + s1 = s1["col"] + result = s1.truncate(before=2, after=3) - df = pd.DataFrame.from_dict( + df = DataFrame.from_dict( {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]} ) expected = df.set_index(["L1", "L2"]) + if frame_or_series is Series: + expected = expected["col"] - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_tz_convert.py b/pandas/tests/frame/methods/test_tz_convert.py index c70e479723644..ecb30cf11319b 100644 --- a/pandas/tests/frame/methods/test_tz_convert.py +++ b/pandas/tests/frame/methods/test_tz_convert.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, Index, MultiIndex, Series, date_range +from pandas import DataFrame, Index, MultiIndex, date_range import pandas._testing as tm @@ -89,17 +89,16 @@ def test_tz_convert_and_localize(self, fn): df = DataFrame(index=l0) df = getattr(df, fn)("US/Pacific", level=1) - @pytest.mark.parametrize("klass", [Series, DataFrame]) @pytest.mark.parametrize("copy", [True, False]) - def test_tz_convert_copy_inplace_mutate(self, copy, klass): + def test_tz_convert_copy_inplace_mutate(self, copy, frame_or_series): # GH#6326 - obj = klass( + obj = frame_or_series( np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz="Europe/Berlin"), ) orig = obj.copy() result = obj.tz_convert("UTC", copy=copy) - expected = klass(np.arange(0, 5), index=obj.index.tz_convert("UTC")) + expected = frame_or_series(np.arange(0, 5), index=obj.index.tz_convert("UTC")) tm.assert_equal(result, expected) tm.assert_equal(obj, orig) assert result.index is not obj.index diff --git a/pandas/tests/frame/methods/test_tz_localize.py b/pandas/tests/frame/methods/test_tz_localize.py index 183b81ca5298e..aa5ab51fe3d8b 100644 --- a/pandas/tests/frame/methods/test_tz_localize.py +++ b/pandas/tests/frame/methods/test_tz_localize.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, Series, date_range +from pandas import DataFrame, date_range import pandas._testing as tm @@ -23,16 +23,15 @@ def test_frame_tz_localize(self): assert result.columns.tz.zone == "UTC" tm.assert_frame_equal(result, expected.T) - @pytest.mark.parametrize("klass", [Series, DataFrame]) @pytest.mark.parametrize("copy", [True, False]) - def test_tz_localize_copy_inplace_mutate(self, copy, klass): + def test_tz_localize_copy_inplace_mutate(self, copy, frame_or_series): # GH#6326 - obj = klass( + obj = frame_or_series( np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=None) ) orig = obj.copy() result = obj.tz_localize("UTC", copy=copy) - expected = klass( + expected = frame_or_series( np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz="UTC"), ) diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index 564a659724768..fb0c5d31f692b 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -1,6 +1,7 @@ import numpy as np +import pytest -from pandas import DataFrame, NaT, Timestamp, date_range +from pandas import DataFrame, NaT, Series, Timestamp, date_range, period_range import pandas._testing as tm @@ -44,6 +45,22 @@ def test_values_duplicates(self): tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("constructor", [date_range, period_range]) + def test_values_casts_datetimelike_to_object(self, constructor): + series = Series(constructor("2000-01-01", periods=10, freq="D")) + + expected = series.astype("object") + + df = DataFrame({"a": series, "b": np.random.randn(len(series))}) + + result = df.values.squeeze() + assert (result[:, 0] == expected.values).all() + + df = DataFrame({"a": series, "b": ["foo"] * len(series)}) + + result = df.values.squeeze() + assert (result[:, 0] == expected.values).all() + def test_frame_values_with_tz(self): tz = "US/Central" df = DataFrame({"A": date_range("2000", periods=4, tz=tz)}) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 3cd35e900ee06..4bd1d5fa56468 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1,7 +1,6 @@ from datetime import datetime import numpy as np -import pytest from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -24,15 +23,6 @@ class TestDataFrameAlterAxes: - def test_set_index_directly(self, float_string_frame): - df = float_string_frame - idx = Index(np.arange(len(df))[::-1]) - - df.index = idx - tm.assert_index_equal(df.index, idx) - with pytest.raises(ValueError, match="Length mismatch"): - df.index = idx[::2] - def test_convert_dti_to_series(self): # don't cast a DatetimeIndex WITH a tz, leave as object # GH 6032 @@ -101,12 +91,6 @@ def test_convert_dti_to_series(self): df.pop("ts") tm.assert_frame_equal(df, expected) - def test_set_columns(self, float_string_frame): - cols = Index(np.arange(len(float_string_frame.columns))) - float_string_frame.columns = cols - with pytest.raises(ValueError, match="Length mismatch"): - float_string_frame.columns = cols[::2] - def test_dti_set_index_reindex(self): # GH 6631 df = DataFrame(np.random.random(6)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 46e34a7a58ae4..408024e48a35a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2737,6 +2737,35 @@ def test_constructor_list_str_na(self, string_dtype): class TestDataFrameConstructorWithDatetimeTZ: + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_construction_preserves_tzaware_dtypes(self, tz): + # after GH#7822 + # these retain the timezones on dict construction + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") + dr_tz = dr.tz_localize(tz) + df = DataFrame({"A": "foo", "B": dr_tz}, index=dr) + tz_expected = DatetimeTZDtype("ns", dr_tz.tzinfo) + assert df["B"].dtype == tz_expected + + # GH#2810 (with timezones) + datetimes_naive = [ts.to_pydatetime() for ts in dr] + datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] + df = DataFrame({"dr": dr}) + df["dr_tz"] = dr_tz + df["datetimes_naive"] = datetimes_naive + df["datetimes_with_tz"] = datetimes_with_tz + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + DatetimeTZDtype(tz=tz), + np.dtype("datetime64[ns]"), + DatetimeTZDtype(tz=tz), + ], + index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"], + ) + tm.assert_series_equal(result, expected) + def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): # GH#25843 tz = tz_aware_fixture diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index c6b1c69442dbc..1c54855ee7bce 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -400,29 +400,6 @@ def check(result, expected=None): result = z.loc[["a", "c", "a"]] check(result, expected) - def test_column_dups_indexing2(self): - - # GH 8363 - # datetime ops with a non-unique index - df = DataFrame( - {"A": np.arange(5, dtype="int64"), "B": np.arange(1, 6, dtype="int64")}, - index=[2, 2, 3, 3, 4], - ) - result = df.B - df.A - expected = Series(1, index=[2, 2, 3, 3, 4]) - tm.assert_series_equal(result, expected) - - df = DataFrame( - { - "A": date_range("20130101", periods=5), - "B": date_range("20130101 09:00:00", periods=5), - }, - index=[2, 2, 3, 3, 4], - ) - result = df.B - df.A - expected = Series(pd.Timedelta("9 hours"), index=[2, 2, 3, 3, 4]) - tm.assert_series_equal(result, expected) - def test_columns_with_dups(self): # GH 3468 related diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py deleted file mode 100644 index 22ffb30324366..0000000000000 --- a/pandas/tests/frame/test_timeseries.py +++ /dev/null @@ -1,57 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -from pandas import DataFrame, to_datetime -import pandas._testing as tm - - -class TestDataFrameTimeSeriesMethods: - @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"]) - def test_frame_append_datetime64_col_other_units(self, unit): - n = 100 - - ns_dtype = np.dtype("M8[ns]") - - dtype = np.dtype(f"M8[{unit}]") - vals = np.arange(n, dtype=np.int64).view(dtype) - - df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) - df[unit] = vals - - ex_vals = to_datetime(vals.astype("O")).values - - assert df[unit].dtype == ns_dtype - assert (df[unit].values == ex_vals).all() - - @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"]) - def test_frame_setitem_existing_datetime64_col_other_units(self, unit): - # Test insertion into existing datetime64 column - n = 100 - ns_dtype = np.dtype("M8[ns]") - - df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) - df["dates"] = np.arange(n, dtype=np.int64).view(ns_dtype) - - dtype = np.dtype(f"M8[{unit}]") - vals = np.arange(n, dtype=np.int64).view(dtype) - - tmp = df.copy() - - tmp["dates"] = vals - ex_vals = to_datetime(vals.astype("O")).values - - assert (tmp["dates"].values == ex_vals).all() - - def test_datetime_assignment_with_NaT_and_diff_time_units(self): - # GH 7492 - data_ns = np.array([1, "nat"], dtype="datetime64[ns]") - result = pd.Series(data_ns).to_frame() - result["new"] = data_ns - expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]") - tm.assert_frame_equal(result, expected) - # OutOfBoundsDatetime error shouldn't occur - data_s = np.array([1, "nat"], dtype="datetime64[s]") - result["new"] = data_s - expected = DataFrame({0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]") - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py deleted file mode 100644 index 1271a490d6b70..0000000000000 --- a/pandas/tests/frame/test_timezones.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Tests for DataFrame timezone-related methods -""" -import numpy as np -import pytest - -from pandas.core.dtypes.dtypes import DatetimeTZDtype - -from pandas import DataFrame, Series -import pandas._testing as tm -from pandas.core.indexes.datetimes import date_range - - -class TestDataFrameTimezones: - @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) - def test_frame_no_datetime64_dtype(self, tz): - # after GH#7822 - # these retain the timezones on dict construction - dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") - dr_tz = dr.tz_localize(tz) - df = DataFrame({"A": "foo", "B": dr_tz}, index=dr) - tz_expected = DatetimeTZDtype("ns", dr_tz.tzinfo) - assert df["B"].dtype == tz_expected - - # GH#2810 (with timezones) - datetimes_naive = [ts.to_pydatetime() for ts in dr] - datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] - df = DataFrame({"dr": dr}) - df["dr_tz"] = dr_tz - df["datetimes_naive"] = datetimes_naive - df["datetimes_with_tz"] = datetimes_with_tz - result = df.dtypes - expected = Series( - [ - np.dtype("datetime64[ns]"), - DatetimeTZDtype(tz=tz), - np.dtype("datetime64[ns]"), - DatetimeTZDtype(tz=tz), - ], - index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"], - ) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) - def test_frame_reset_index(self, tz): - dr = date_range("2012-06-02", periods=10, tz=tz) - df = DataFrame(np.random.randn(len(dr)), dr) - roundtripped = df.reset_index().set_index("index") - xp = df.index.tz - rs = roundtripped.index.tz - assert xp == rs - - @pytest.mark.parametrize("tz", [None, "America/New_York"]) - def test_boolean_compare_transpose_tzindex_with_dst(self, tz): - # GH 19970 - idx = date_range("20161101", "20161130", freq="4H", tz=tz) - df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) - result = df.T == df.T - expected = DataFrame(True, index=list("ab"), columns=idx) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/generic/methods/test_first_valid_index.py b/pandas/tests/generic/methods/test_first_valid_index.py index bca3452c3c458..8d021f0e3954e 100644 --- a/pandas/tests/generic/methods/test_first_valid_index.py +++ b/pandas/tests/generic/methods/test_first_valid_index.py @@ -9,10 +9,9 @@ class TestFirstValidIndex: - @pytest.mark.parametrize("klass", [Series, DataFrame]) - def test_first_valid_index_single_nan(self, klass): + def test_first_valid_index_single_nan(self, frame_or_series): # GH#9752 Series/DataFrame should both return None, not raise - obj = klass([np.nan]) + obj = frame_or_series([np.nan]) assert obj.first_valid_index() is None assert obj.iloc[:0].first_valid_index() is None diff --git a/pandas/tests/generic/methods/test_pipe.py b/pandas/tests/generic/methods/test_pipe.py index 59e5edc4b8bb5..b378600634bf0 100644 --- a/pandas/tests/generic/methods/test_pipe.py +++ b/pandas/tests/generic/methods/test_pipe.py @@ -5,11 +5,10 @@ class TestPipe: - @pytest.mark.parametrize("klass", [Series, DataFrame]) - def test_pipe(self, klass): + def test_pipe(self, frame_or_series): obj = DataFrame({"A": [1, 2, 3]}) expected = DataFrame({"A": [1, 4, 9]}) - if klass is Series: + if frame_or_series is Series: obj = obj["A"] expected = expected["A"] @@ -17,20 +16,18 @@ def test_pipe(self, klass): result = obj.pipe(f, 2) tm.assert_equal(result, expected) - @pytest.mark.parametrize("klass", [Series, DataFrame]) - def test_pipe_tuple(self, klass): + def test_pipe_tuple(self, frame_or_series): obj = DataFrame({"A": [1, 2, 3]}) - if klass is Series: + if frame_or_series is Series: obj = obj["A"] f = lambda x, y: y result = obj.pipe((f, "y"), 0) tm.assert_equal(result, obj) - @pytest.mark.parametrize("klass", [Series, DataFrame]) - def test_pipe_tuple_error(self, klass): + def test_pipe_tuple_error(self, frame_or_series): obj = DataFrame({"A": [1, 2, 3]}) - if klass is Series: + if frame_or_series is Series: obj = obj["A"] f = lambda x, y: y diff --git a/pandas/tests/generic/methods/test_reorder_levels.py b/pandas/tests/generic/methods/test_reorder_levels.py index 8bb6417e56659..6bfbf089a6108 100644 --- a/pandas/tests/generic/methods/test_reorder_levels.py +++ b/pandas/tests/generic/methods/test_reorder_levels.py @@ -1,20 +1,19 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Series +from pandas import DataFrame, MultiIndex import pandas._testing as tm class TestReorderLevels: - @pytest.mark.parametrize("klass", [Series, DataFrame]) - def test_reorder_levels(self, klass): + def test_reorder_levels(self, frame_or_series): index = MultiIndex( levels=[["bar"], ["one", "two", "three"], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], names=["L0", "L1", "L2"], ) df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index) - obj = df if klass is DataFrame else df["A"] + obj = df if frame_or_series is DataFrame else df["A"] # no change, position result = obj.reorder_levels([0, 1, 2]) @@ -32,7 +31,7 @@ def test_reorder_levels(self, klass): names=["L1", "L2", "L0"], ) expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) - expected = expected if klass is DataFrame else expected["A"] + expected = expected if frame_or_series is DataFrame else expected["A"] tm.assert_equal(result, expected) result = obj.reorder_levels([0, 0, 0]) @@ -42,7 +41,7 @@ def test_reorder_levels(self, klass): names=["L0", "L0", "L0"], ) expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) - expected = expected if klass is DataFrame else expected["A"] + expected = expected if frame_or_series is DataFrame else expected["A"] tm.assert_equal(result, expected) result = obj.reorder_levels(["L0", "L0", "L0"]) diff --git a/pandas/tests/generic/methods/test_sample.py b/pandas/tests/generic/methods/test_sample.py index 7303dad9170ed..b26a3785f918d 100644 --- a/pandas/tests/generic/methods/test_sample.py +++ b/pandas/tests/generic/methods/test_sample.py @@ -155,22 +155,20 @@ def test_sample_none_weights(self, obj): ), ], ) - @pytest.mark.parametrize("klass", [Series, DataFrame]) - def test_sample_random_state(self, func_str, arg, klass): + def test_sample_random_state(self, func_str, arg, frame_or_series): # GH#32503 obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)}) - if klass is Series: + if frame_or_series is Series: obj = obj["col1"] result = obj.sample(n=3, random_state=eval(func_str)(arg)) expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg))) tm.assert_equal(result, expected) - @pytest.mark.parametrize("klass", [Series, DataFrame]) - def test_sample_upsampling_without_replacement(self, klass): + def test_sample_upsampling_without_replacement(self, frame_or_series): # GH#27451 obj = DataFrame({"A": list("abc")}) - if klass is Series: + if frame_or_series is Series: obj = obj["A"] msg = ( diff --git a/pandas/tests/generic/methods/test_set_axis.py b/pandas/tests/generic/methods/test_set_axis.py index 278d43ef93d2f..a46a91811f40e 100644 --- a/pandas/tests/generic/methods/test_set_axis.py +++ b/pandas/tests/generic/methods/test_set_axis.py @@ -57,6 +57,28 @@ def test_set_axis_invalid_axis_name(self, axis, obj): with pytest.raises(ValueError, match="No axis named"): obj.set_axis(list("abc"), axis=axis) + def test_set_axis_setattr_index_not_collection(self, obj): + # wrong type + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, None was passed" + ) + with pytest.raises(TypeError, match=msg): + obj.index = None + + def test_set_axis_setattr_index_wrong_length(self, obj): + # wrong length + msg = ( + f"Length mismatch: Expected axis has {len(obj)} elements, " + f"new values have {len(obj)-1} elements" + ) + with pytest.raises(ValueError, match=msg): + obj.index = np.arange(len(obj) - 1) + + if obj.ndim == 2: + with pytest.raises(ValueError, match="Length mismatch"): + obj.columns = obj.columns[::2] + class TestDataFrameSetAxis(SharedSetAxisTests): @pytest.fixture diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index d7aadda990f53..e38936baca758 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -178,20 +178,14 @@ marks=not_implemented_mark, ), pytest.param( - (pd.DataFrame, frame_data, operator.methodcaller("diff")), - marks=not_implemented_mark, - ), - pytest.param( - (pd.DataFrame, frame_data, operator.methodcaller("applymap", lambda x: x)), - marks=not_implemented_mark, + (pd.DataFrame, frame_data, operator.methodcaller("applymap", lambda x: x)) ), pytest.param( ( pd.DataFrame, frame_data, operator.methodcaller("append", pd.DataFrame({"A": [1]})), - ), - marks=not_implemented_mark, + ) ), pytest.param( ( @@ -199,7 +193,6 @@ frame_data, operator.methodcaller("append", pd.DataFrame({"B": [1]})), ), - marks=not_implemented_mark, ), pytest.param( ( @@ -424,8 +417,6 @@ (pd.DataFrame, frame_data, operator.methodcaller("where", np.array([[True]]))), (pd.Series, ([1, 2],), operator.methodcaller("mask", np.array([True, False]))), (pd.DataFrame, frame_data, operator.methodcaller("mask", np.array([[True]]))), - (pd.Series, ([1, 2],), operator.methodcaller("slice_shift")), - (pd.DataFrame, frame_data, operator.methodcaller("slice_shift")), pytest.param( ( pd.Series, diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 45601abc95fe6..7fde448bb36dc 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -5,8 +5,7 @@ from pandas.core.dtypes.common import is_scalar -import pandas as pd -from pandas import DataFrame, Series, date_range +from pandas import DataFrame, Series import pandas._testing as tm # ---------------------------------------------------------------------- @@ -248,31 +247,6 @@ def test_metadata_propagation(self): self.check_metadata(v1 & v2) self.check_metadata(v1 | v2) - def test_head_tail(self, index): - # GH5370 - - o = self._construct(shape=len(index)) - - axis = o._get_axis_name(0) - setattr(o, axis, index) - - o.head() - - self._compare(o.head(), o.iloc[:5]) - self._compare(o.tail(), o.iloc[-5:]) - - # 0-len - self._compare(o.head(0), o.iloc[0:0]) - self._compare(o.tail(0), o.iloc[0:0]) - - # bounded - self._compare(o.head(len(o) + 1), o) - self._compare(o.tail(len(o) + 1), o) - - # neg index - self._compare(o.head(-3), o.head(len(index) - 3)) - self._compare(o.tail(-3), o.tail(len(index) - 3)) - def test_size_compat(self): # GH8846 # size property should be defined @@ -460,77 +434,23 @@ def test_take_invalid_kwargs(self): obj.take(indices, mode="clip") @pytest.mark.parametrize("is_copy", [True, False]) - def test_depr_take_kwarg_is_copy(self, is_copy): + def test_depr_take_kwarg_is_copy(self, is_copy, frame_or_series): # GH 27357 - df = DataFrame({"A": [1, 2, 3]}) + obj = DataFrame({"A": [1, 2, 3]}) + if frame_or_series is Series: + obj = obj["A"] + msg = ( "is_copy is deprecated and will be removed in a future version. " "'take' always returns a copy, so there is no need to specify this." ) with tm.assert_produces_warning(FutureWarning) as w: - df.take([0, 1], is_copy=is_copy) + obj.take([0, 1], is_copy=is_copy) assert w[0].message.args[0] == msg - s = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - s.take([0, 1], is_copy=is_copy) - - def test_equals(self): - # Add object dtype column with nans - index = np.random.random(10) - df1 = DataFrame(np.random.random(10), index=index, columns=["floats"]) - df1["text"] = "the sky is so blue. we could use more chocolate.".split() - df1["start"] = date_range("2000-1-1", periods=10, freq="T") - df1["end"] = date_range("2000-1-1", periods=10, freq="D") - df1["diff"] = df1["end"] - df1["start"] - df1["bool"] = np.arange(10) % 3 == 0 - df1.loc[::2] = np.nan - df2 = df1.copy() - assert df1["text"].equals(df2["text"]) - assert df1["start"].equals(df2["start"]) - assert df1["end"].equals(df2["end"]) - assert df1["diff"].equals(df2["diff"]) - assert df1["bool"].equals(df2["bool"]) - assert df1.equals(df2) - assert not df1.equals(object) - - # different dtype - different = df1.copy() - different["floats"] = different["floats"].astype("float32") - assert not df1.equals(different) - - # different index - different_index = -index - different = df2.set_index(different_index) - assert not df1.equals(different) - - # different columns - different = df2.copy() - different.columns = df2.columns[::-1] - assert not df1.equals(different) - - # DatetimeIndex - index = pd.date_range("2000-1-1", periods=10, freq="T") - df1 = df1.set_index(index) - df2 = df1.copy() - assert df1.equals(df2) - - # MultiIndex - df3 = df1.set_index(["text"], append=True) - df2 = df1.set_index(["text"], append=True) - assert df3.equals(df2) - - df2 = df1.set_index(["floats"], append=True) - assert not df3.equals(df2) - - # NaN in index - df3 = df1.set_index(["floats"], append=True) - df2 = df1.set_index(["floats"], append=True) - assert df3.equals(df2) - - @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) - def test_axis_classmethods(self, box): + def test_axis_classmethods(self, frame_or_series): + box = frame_or_series obj = box(dtype=object) values = box._AXIS_TO_AXIS_NUMBER.keys() for v in values: @@ -538,26 +458,36 @@ def test_axis_classmethods(self, box): assert obj._get_axis_name(v) == box._get_axis_name(v) assert obj._get_block_manager_axis(v) == box._get_block_manager_axis(v) - @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) - def test_axis_names_deprecated(self, box): + def test_axis_names_deprecated(self, frame_or_series): # GH33637 + box = frame_or_series obj = box(dtype=object) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): obj._AXIS_NAMES - @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) - def test_axis_numbers_deprecated(self, box): + def test_axis_numbers_deprecated(self, frame_or_series): # GH33637 + box = frame_or_series obj = box(dtype=object) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): obj._AXIS_NUMBERS - @pytest.mark.parametrize("as_frame", [True, False]) - def test_flags_identity(self, as_frame): + def test_flags_identity(self, frame_or_series): s = Series([1, 2]) - if as_frame: + if frame_or_series is DataFrame: s = s.to_frame() assert s.flags is s.flags s2 = s.copy() assert s2.flags is not s.flags + + def test_slice_shift_deprecated(self): + # GH 37601 + df = DataFrame({"A": [1, 2, 3, 4]}) + s = Series([1, 2, 3, 4]) + + with tm.assert_produces_warning(FutureWarning): + df["A"].slice_shift() + + with tm.assert_produces_warning(FutureWarning): + s.slice_shift() diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2563eeeb68672..a0c228200e73a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1298,6 +1298,13 @@ def test_groupby_nat_exclude(): grouped.get_group(pd.NaT) +def test_groupby_two_group_keys_all_nan(): + # GH #36842: Grouping over two group keys shouldn't raise an error + df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]}) + result = df.groupby(["a", "b"]).indices + assert result == {} + + def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 29a8f883f0ff5..e38fa5e8de87e 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -335,3 +335,21 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, expected = pd.DataFrame(selected_data, index=mi) tm.assert_frame_equal(result, expected) + + +def test_groupby_nan_included(): + # GH 35646 + data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} + df = pd.DataFrame(data) + grouped = df.groupby("group", dropna=False) + result = grouped.indices + dtype = np.intp + expected = { + "g1": np.array([0, 2], dtype=dtype), + "g2": np.array([3], dtype=dtype), + np.nan: np.array([1, 4], dtype=dtype), + } + for result_values, expected_values in zip(result.values(), expected.values()): + tm.assert_numpy_array_equal(result_values, expected_values) + assert np.isnan(list(result.keys())[2]) + assert list(result.keys())[0:2] == ["g1", "g2"] diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index cc7a79e976513..d268d87708552 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -51,6 +51,15 @@ def test_groupby_preserves_subclass(obj, groupby_func): tm.assert_series_equal(result1, result2) +def test_groupby_preserves_metadata(): + # GH-37343 + custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]}) + assert "testattr" in custom_df._metadata + custom_df.testattr = "hello" + for _, group_df in custom_df.groupby("c"): + assert group_df.testattr == "hello" + + @pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame]) def test_groupby_resample_preserves_subclass(obj): # GH28330 -- preserve subclass through groupby.resample() diff --git a/pandas/tests/indexes/base_class/test_where.py b/pandas/tests/indexes/base_class/test_where.py new file mode 100644 index 0000000000000..0c8969735e14e --- /dev/null +++ b/pandas/tests/indexes/base_class/test_where.py @@ -0,0 +1,13 @@ +import numpy as np + +from pandas import Index +import pandas._testing as tm + + +class TestWhere: + def test_where_intlike_str_doesnt_cast_ints(self): + idx = Index(range(3)) + mask = np.array([True, False, True]) + res = idx.where(mask, "2") + expected = Index([0, "2", 2]) + tm.assert_index_equal(res, expected) diff --git a/pandas/tests/indexes/categorical/test_fillna.py b/pandas/tests/indexes/categorical/test_fillna.py index f6a6747166011..c8fc55c29054e 100644 --- a/pandas/tests/indexes/categorical/test_fillna.py +++ b/pandas/tests/indexes/categorical/test_fillna.py @@ -14,7 +14,7 @@ def test_fillna_categorical(self): tm.assert_index_equal(idx.fillna(1.0), exp) # fill by value not in categories raises ValueError - msg = "'fill_value=2.0' is not present in this Categorical's categories" + msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): idx.fillna(2.0) @@ -36,7 +36,7 @@ def test_fillna_validates_with_no_nas(self): ci = CategoricalIndex([2, 3, 3]) cat = ci._data - msg = "'fill_value=False' is not present in this Categorical's categories" + msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): ci.fillna(False) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index be8ca61f1a730..6f078237e3a97 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -143,10 +143,9 @@ def test_where_cast_str(self): result = index.where(mask, [str(index[0])]) tm.assert_index_equal(result, expected) - msg = "Where requires matching dtype, not foo" + msg = "value should be a '.*', 'NaT', or array of those" with pytest.raises(TypeError, match=msg): index.where(mask, "foo") - msg = r"Where requires matching dtype, not \['foo'\]" with pytest.raises(TypeError, match=msg): index.where(mask, ["foo"]) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 4e46eb126894b..59269b9b54ddc 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -177,24 +177,26 @@ def test_where_invalid_dtypes(self): i2 = Index([pd.NaT, pd.NaT] + dti[2:].tolist()) - with pytest.raises(TypeError, match="Where requires matching dtype"): + msg = "value should be a 'Timestamp', 'NaT', or array of those. Got" + msg2 = "Cannot compare tz-naive and tz-aware datetime-like objects" + with pytest.raises(TypeError, match=msg2): # passing tz-naive ndarray to tzaware DTI dti.where(notna(i2), i2.values) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg2): # passing tz-aware DTI to tznaive DTI dti.tz_localize(None).where(notna(i2), i2) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): dti.where(notna(i2), i2.tz_localize(None).to_period("D")) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): dti.where(notna(i2), i2.asi8.view("timedelta64[ns]")) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): dti.where(notna(i2), i2.asi8) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): # non-matching scalar dti.where(notna(i2), pd.Timedelta(days=4)) @@ -203,7 +205,7 @@ def test_where_mismatched_nat(self, tz_aware_fixture): dti = pd.date_range("2013-01-01", periods=3, tz=tz) cond = np.array([True, False, True]) - msg = "Where requires matching dtype" + msg = "value should be a 'Timestamp', 'NaT', or array of those. Got" with pytest.raises(TypeError, match=msg): # wrong-dtyped NaT dti.where(cond, np.timedelta64("NaT", "ns")) @@ -542,6 +544,13 @@ def test_contains_nonunique(self, vals): class TestGetIndexer: + def test_get_indexer_date_objs(self): + rng = date_range("1/1/2000", periods=20) + + result = rng.get_indexer(rng.map(lambda x: x.date())) + expected = rng.get_indexer(rng) + tm.assert_numpy_array_equal(result, expected) + def test_get_indexer(self): idx = pd.date_range("2000-01-01", periods=3) exp = np.array([0, 1, 2], dtype=np.intp) @@ -719,3 +728,12 @@ def test_slice_datetime_locs(self, box, kind, tz_aware_fixture): result = index.slice_locs(key, box(2010, 1, 2)) expected = (0, 1) assert result == expected + + +class TestIndexerBetweenTime: + def test_indexer_between_time(self): + # GH#11818 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time" + with pytest.raises(ValueError, match=msg): + rng.indexer_between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 67e031b53e44e..157446b1fff5d 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -579,9 +579,11 @@ def test_comparison(self): actual = self.index == self.index.left tm.assert_numpy_array_equal(actual, np.array([False, False])) - msg = ( - "not supported between instances of 'int' and " - "'pandas._libs.interval.Interval'" + msg = "|".join( + [ + "not supported between instances of 'int' and '.*.Interval'", + r"Invalid comparison between dtype=interval\[int64\] and ", + ] ) with pytest.raises(TypeError, match=msg): self.index > 0 diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index b6d3c36f1682c..19dfa9137cc5c 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -545,16 +545,17 @@ def test_where_invalid_dtypes(self): i2 = PeriodIndex([NaT, NaT] + pi[2:].tolist(), freq="D") - with pytest.raises(TypeError, match="Where requires matching dtype"): + msg = "value should be a 'Period', 'NaT', or array of those" + with pytest.raises(TypeError, match=msg): pi.where(notna(i2), i2.asi8) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): pi.where(notna(i2), i2.asi8.view("timedelta64[ns]")) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): pi.where(notna(i2), i2.to_timestamp("S")) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): # non-matching scalar pi.where(notna(i2), Timedelta(days=4)) @@ -562,7 +563,7 @@ def test_where_mismatched_nat(self): pi = period_range("20130101", periods=5, freq="D") cond = np.array([True, False, True, True, False]) - msg = "Where requires matching dtype" + msg = "value should be a 'Period', 'NaT', or array of those" with pytest.raises(TypeError, match=msg): # wrong-dtyped NaT pi.where(cond, np.timedelta64("NaT", "ns")) diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 396a676b97a1b..37aa9653550fb 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -150,16 +150,17 @@ def test_where_invalid_dtypes(self): i2 = Index([pd.NaT, pd.NaT] + tdi[2:].tolist()) - with pytest.raises(TypeError, match="Where requires matching dtype"): + msg = "value should be a 'Timedelta', 'NaT', or array of those" + with pytest.raises(TypeError, match=msg): tdi.where(notna(i2), i2.asi8) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): tdi.where(notna(i2), i2 + pd.Timestamp.now()) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): tdi.where(notna(i2), (i2 + pd.Timestamp.now()).to_period("D")) - with pytest.raises(TypeError, match="Where requires matching dtype"): + with pytest.raises(TypeError, match=msg): # non-matching scalar tdi.where(notna(i2), pd.Timestamp.now()) @@ -167,7 +168,7 @@ def test_where_mismatched_nat(self): tdi = timedelta_range("1 day", periods=3, freq="D", name="idx") cond = np.array([True, False, False]) - msg = "Where requires matching dtype" + msg = "value should be a 'Timedelta', 'NaT', or array of those" with pytest.raises(TypeError, match=msg): # wrong-dtyped NaT tdi.where(cond, np.datetime64("NaT", "ns")) diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py deleted file mode 100644 index b98c9a3df0438..0000000000000 --- a/pandas/tests/indexing/test_callable.py +++ /dev/null @@ -1,254 +0,0 @@ -import numpy as np - -import pandas as pd -import pandas._testing as tm - - -class TestIndexingCallable: - def test_frame_loc_callable(self): - # GH 11485 - df = pd.DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) - # iloc cannot use boolean Series (see GH3635) - - # return bool indexer - res = df.loc[lambda x: x.A > 2] - tm.assert_frame_equal(res, df.loc[df.A > 2]) - - res = df.loc[lambda x: x.A > 2] - tm.assert_frame_equal(res, df.loc[df.A > 2]) - - res = df.loc[lambda x: x.A > 2] - tm.assert_frame_equal(res, df.loc[df.A > 2]) - - res = df.loc[lambda x: x.A > 2] - tm.assert_frame_equal(res, df.loc[df.A > 2]) - - res = df.loc[lambda x: x.B == "b", :] - tm.assert_frame_equal(res, df.loc[df.B == "b", :]) - - res = df.loc[lambda x: x.B == "b", :] - tm.assert_frame_equal(res, df.loc[df.B == "b", :]) - - res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] - tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) - - res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] - tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) - - res = df.loc[lambda x: x.A > 2, lambda x: "B"] - tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) - - res = df.loc[lambda x: x.A > 2, lambda x: "B"] - tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) - - res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] - tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - - res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] - tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - - res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] - tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) - - res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] - tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) - - # scalar - res = df.loc[lambda x: 1, lambda x: "A"] - assert res == df.loc[1, "A"] - - res = df.loc[lambda x: 1, lambda x: "A"] - assert res == df.loc[1, "A"] - - def test_frame_loc_callable_mixture(self): - # GH 11485 - df = pd.DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) - - res = df.loc[lambda x: x.A > 2, ["A", "B"]] - tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - - res = df.loc[lambda x: x.A > 2, ["A", "B"]] - tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - - res = df.loc[[2, 3], lambda x: ["A", "B"]] - tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) - - res = df.loc[[2, 3], lambda x: ["A", "B"]] - tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) - - res = df.loc[3, lambda x: ["A", "B"]] - tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) - - res = df.loc[3, lambda x: ["A", "B"]] - tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) - - def test_frame_loc_callable_labels(self): - # GH 11485 - df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) - - # return label - res = df.loc[lambda x: ["A", "C"]] - tm.assert_frame_equal(res, df.loc[["A", "C"]]) - - res = df.loc[lambda x: ["A", "C"]] - tm.assert_frame_equal(res, df.loc[["A", "C"]]) - - res = df.loc[lambda x: ["A", "C"], :] - tm.assert_frame_equal(res, df.loc[["A", "C"], :]) - - res = df.loc[lambda x: ["A", "C"], lambda x: "X"] - tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) - - res = df.loc[lambda x: ["A", "C"], lambda x: ["X"]] - tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) - - # mixture - res = df.loc[["A", "C"], lambda x: "X"] - tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) - - res = df.loc[["A", "C"], lambda x: ["X"]] - tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) - - res = df.loc[lambda x: ["A", "C"], "X"] - tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) - - res = df.loc[lambda x: ["A", "C"], ["X"]] - tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) - - def test_frame_loc_callable_setitem(self): - # GH 11485 - df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) - - # return label - res = df.copy() - res.loc[lambda x: ["A", "C"]] = -20 - exp = df.copy() - exp.loc[["A", "C"]] = -20 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.loc[lambda x: ["A", "C"], :] = 20 - exp = df.copy() - exp.loc[["A", "C"], :] = 20 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.loc[lambda x: ["A", "C"], lambda x: "X"] = -1 - exp = df.copy() - exp.loc[["A", "C"], "X"] = -1 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.loc[lambda x: ["A", "C"], lambda x: ["X"]] = [5, 10] - exp = df.copy() - exp.loc[["A", "C"], ["X"]] = [5, 10] - tm.assert_frame_equal(res, exp) - - # mixture - res = df.copy() - res.loc[["A", "C"], lambda x: "X"] = np.array([-1, -2]) - exp = df.copy() - exp.loc[["A", "C"], "X"] = np.array([-1, -2]) - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.loc[["A", "C"], lambda x: ["X"]] = 10 - exp = df.copy() - exp.loc[["A", "C"], ["X"]] = 10 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.loc[lambda x: ["A", "C"], "X"] = -2 - exp = df.copy() - exp.loc[["A", "C"], "X"] = -2 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.loc[lambda x: ["A", "C"], ["X"]] = -4 - exp = df.copy() - exp.loc[["A", "C"], ["X"]] = -4 - tm.assert_frame_equal(res, exp) - - def test_frame_iloc_callable(self): - # GH 11485 - df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) - - # return location - res = df.iloc[lambda x: [1, 3]] - tm.assert_frame_equal(res, df.iloc[[1, 3]]) - - res = df.iloc[lambda x: [1, 3], :] - tm.assert_frame_equal(res, df.iloc[[1, 3], :]) - - res = df.iloc[lambda x: [1, 3], lambda x: 0] - tm.assert_series_equal(res, df.iloc[[1, 3], 0]) - - res = df.iloc[lambda x: [1, 3], lambda x: [0]] - tm.assert_frame_equal(res, df.iloc[[1, 3], [0]]) - - # mixture - res = df.iloc[[1, 3], lambda x: 0] - tm.assert_series_equal(res, df.iloc[[1, 3], 0]) - - res = df.iloc[[1, 3], lambda x: [0]] - tm.assert_frame_equal(res, df.iloc[[1, 3], [0]]) - - res = df.iloc[lambda x: [1, 3], 0] - tm.assert_series_equal(res, df.iloc[[1, 3], 0]) - - res = df.iloc[lambda x: [1, 3], [0]] - tm.assert_frame_equal(res, df.iloc[[1, 3], [0]]) - - def test_frame_iloc_callable_setitem(self): - # GH 11485 - df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) - - # return location - res = df.copy() - res.iloc[lambda x: [1, 3]] = 0 - exp = df.copy() - exp.iloc[[1, 3]] = 0 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.iloc[lambda x: [1, 3], :] = -1 - exp = df.copy() - exp.iloc[[1, 3], :] = -1 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.iloc[lambda x: [1, 3], lambda x: 0] = 5 - exp = df.copy() - exp.iloc[[1, 3], 0] = 5 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.iloc[lambda x: [1, 3], lambda x: [0]] = 25 - exp = df.copy() - exp.iloc[[1, 3], [0]] = 25 - tm.assert_frame_equal(res, exp) - - # mixture - res = df.copy() - res.iloc[[1, 3], lambda x: 0] = -3 - exp = df.copy() - exp.iloc[[1, 3], 0] = -3 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.iloc[[1, 3], lambda x: [0]] = -5 - exp = df.copy() - exp.iloc[[1, 3], [0]] = -5 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.iloc[lambda x: [1, 3], 0] = 10 - exp = df.copy() - exp.iloc[[1, 3], 0] = 10 - tm.assert_frame_equal(res, exp) - - res = df.copy() - res.iloc[lambda x: [1, 3], [0]] = [-5, -5] - exp = df.copy() - exp.iloc[[1, 3], [0]] = [-5, -5] - tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 436b2aa838b08..fd6f6fbc6a4ba 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -780,7 +780,7 @@ def test_where_index_timedelta64(self, value): result = tdi.where(cond, value) tm.assert_index_equal(result, expected) - msg = "Where requires matching dtype" + msg = "value should be a 'Timedelta', 'NaT', or array of thos" with pytest.raises(TypeError, match=msg): # wrong-dtyped NaT tdi.where(cond, np.datetime64("NaT", "ns")) @@ -804,11 +804,12 @@ def test_where_index_period(self): tm.assert_index_equal(result, expected) # Passing a mismatched scalar - msg = "Where requires matching dtype" + msg = "value should be a 'Period', 'NaT', or array of those" with pytest.raises(TypeError, match=msg): pi.where(cond, pd.Timedelta(days=4)) - with pytest.raises(TypeError, match=msg): + msg = r"Input has different freq=D from PeriodArray\(freq=Q-DEC\)" + with pytest.raises(ValueError, match=msg): pi.where(cond, pd.Period("2020-04-21", "D")) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 867941a97b598..6c80354610a78 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -778,6 +778,92 @@ def test_iloc_setitem_series_duplicate_columns(self): assert df.dtypes.iloc[2] == np.int64 +class TestILocCallable: + def test_frame_iloc_getitem_callable(self): + # GH#11485 + df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + + # return location + res = df.iloc[lambda x: [1, 3]] + tm.assert_frame_equal(res, df.iloc[[1, 3]]) + + res = df.iloc[lambda x: [1, 3], :] + tm.assert_frame_equal(res, df.iloc[[1, 3], :]) + + res = df.iloc[lambda x: [1, 3], lambda x: 0] + tm.assert_series_equal(res, df.iloc[[1, 3], 0]) + + res = df.iloc[lambda x: [1, 3], lambda x: [0]] + tm.assert_frame_equal(res, df.iloc[[1, 3], [0]]) + + # mixture + res = df.iloc[[1, 3], lambda x: 0] + tm.assert_series_equal(res, df.iloc[[1, 3], 0]) + + res = df.iloc[[1, 3], lambda x: [0]] + tm.assert_frame_equal(res, df.iloc[[1, 3], [0]]) + + res = df.iloc[lambda x: [1, 3], 0] + tm.assert_series_equal(res, df.iloc[[1, 3], 0]) + + res = df.iloc[lambda x: [1, 3], [0]] + tm.assert_frame_equal(res, df.iloc[[1, 3], [0]]) + + def test_frame_iloc_setitem_callable(self): + # GH#11485 + df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + + # return location + res = df.copy() + res.iloc[lambda x: [1, 3]] = 0 + exp = df.copy() + exp.iloc[[1, 3]] = 0 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], :] = -1 + exp = df.copy() + exp.iloc[[1, 3], :] = -1 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], lambda x: 0] = 5 + exp = df.copy() + exp.iloc[[1, 3], 0] = 5 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], lambda x: [0]] = 25 + exp = df.copy() + exp.iloc[[1, 3], [0]] = 25 + tm.assert_frame_equal(res, exp) + + # mixture + res = df.copy() + res.iloc[[1, 3], lambda x: 0] = -3 + exp = df.copy() + exp.iloc[[1, 3], 0] = -3 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[[1, 3], lambda x: [0]] = -5 + exp = df.copy() + exp.iloc[[1, 3], [0]] = -5 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], 0] = 10 + exp = df.copy() + exp.iloc[[1, 3], 0] = 10 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], [0]] = [-5, -5] + exp = df.copy() + exp.iloc[[1, 3], [0]] = [-5, -5] + tm.assert_frame_equal(res, exp) + + class TestILocSeries: def test_iloc(self): ser = Series(np.random.randn(10), index=list(range(0, 20, 2))) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index de70ff37a052a..614e424e8aca2 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -116,10 +116,6 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) - if (len(index) == 0) and (idxr_id == "iloc") and isinstance(obj, pd.DataFrame): - # gh-32896 - pytest.skip("This is currently failing. There's an xfailed test below.") - if idxr_id == "iloc": err = ValueError msg = f"Cannot set values with ndim > {obj.ndim}" @@ -140,7 +136,6 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): idxr[nd3] = 0 def test_setitem_ndarray_3d_does_not_fail_for_iloc_empty_dataframe(self): - # when fixing this, please remove the pytest.skip in test_setitem_ndarray_3d i = Index([]) obj = DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i) nd3 = np.random.randint(5, size=(2, 2, 2)) diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py deleted file mode 100644 index 2ffa44bec14a6..0000000000000 --- a/pandas/tests/indexing/test_indexing_slow.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from pandas import DataFrame -import pandas._testing as tm - - -class TestIndexingSlow: - @pytest.mark.slow - def test_large_dataframe_indexing(self): - # GH10692 - result = DataFrame({"x": range(10 ** 6)}, dtype="int64") - result.loc[len(result)] = len(result) + 1 - expected = DataFrame({"x": range(10 ** 6 + 1)}, dtype="int64") - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index b1c66c3c8850a..c1a5db992d3df 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,4 +1,5 @@ """ test label based indexing with loc """ +from datetime import time from io import StringIO import re @@ -992,6 +993,27 @@ def test_loc_setitem_str_to_small_float_conversion_type(self): expected = DataFrame(col_data, columns=["A"], dtype=float) tm.assert_frame_equal(result, expected) + def test_loc_getitem_time_object(self, frame_or_series): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + mask = (rng.hour == 9) & (rng.minute == 30) + + obj = DataFrame(np.random.randn(len(rng), 3), index=rng) + if frame_or_series is Series: + obj = obj[0] + + result = obj.loc[time(9, 30)] + exp = obj.loc[mask] + tm.assert_equal(result, exp) + + chunk = obj.loc["1/4/2000":] + result = chunk.loc[time(9, 30)] + expected = result[-1:] + + # Without resetting the freqs, these are 5 min and 1440 min, respectively + result.index = result.index._with_freq(None) + expected.index = expected.index._with_freq(None) + tm.assert_equal(result, expected) + class TestLocWithMultiIndex: @pytest.mark.parametrize( @@ -1082,6 +1104,182 @@ def test_loc_setitem_multiindex_slice(self): tm.assert_series_equal(result, expected) +class TestLocSetitemWithExpansion: + @pytest.mark.slow + def test_loc_setitem_with_expansion_large_dataframe(self): + # GH#10692 + result = DataFrame({"x": range(10 ** 6)}, dtype="int64") + result.loc[len(result)] = len(result) + 1 + expected = DataFrame({"x": range(10 ** 6 + 1)}, dtype="int64") + tm.assert_frame_equal(result, expected) + + +class TestLocCallable: + def test_frame_loc_getitem_callable(self): + # GH#11485 + df = DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) + # iloc cannot use boolean Series (see GH3635) + + # return bool indexer + res = df.loc[lambda x: x.A > 2] + tm.assert_frame_equal(res, df.loc[df.A > 2]) + + res = df.loc[lambda x: x.A > 2] + tm.assert_frame_equal(res, df.loc[df.A > 2]) + + res = df.loc[lambda x: x.A > 2] + tm.assert_frame_equal(res, df.loc[df.A > 2]) + + res = df.loc[lambda x: x.A > 2] + tm.assert_frame_equal(res, df.loc[df.A > 2]) + + res = df.loc[lambda x: x.B == "b", :] + tm.assert_frame_equal(res, df.loc[df.B == "b", :]) + + res = df.loc[lambda x: x.B == "b", :] + tm.assert_frame_equal(res, df.loc[df.B == "b", :]) + + res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] + tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) + + res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] + tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) + + res = df.loc[lambda x: x.A > 2, lambda x: "B"] + tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) + + res = df.loc[lambda x: x.A > 2, lambda x: "B"] + tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) + + res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) + + res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) + + res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) + + res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) + + # scalar + res = df.loc[lambda x: 1, lambda x: "A"] + assert res == df.loc[1, "A"] + + res = df.loc[lambda x: 1, lambda x: "A"] + assert res == df.loc[1, "A"] + + def test_frame_loc_getitem_callable_mixture(self): + # GH#11485 + df = DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) + + res = df.loc[lambda x: x.A > 2, ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) + + res = df.loc[lambda x: x.A > 2, ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) + + res = df.loc[[2, 3], lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) + + res = df.loc[[2, 3], lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) + + res = df.loc[3, lambda x: ["A", "B"]] + tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) + + res = df.loc[3, lambda x: ["A", "B"]] + tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) + + def test_frame_loc_getitem_callable_labels(self): + # GH#11485 + df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + + # return label + res = df.loc[lambda x: ["A", "C"]] + tm.assert_frame_equal(res, df.loc[["A", "C"]]) + + res = df.loc[lambda x: ["A", "C"]] + tm.assert_frame_equal(res, df.loc[["A", "C"]]) + + res = df.loc[lambda x: ["A", "C"], :] + tm.assert_frame_equal(res, df.loc[["A", "C"], :]) + + res = df.loc[lambda x: ["A", "C"], lambda x: "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) + + res = df.loc[lambda x: ["A", "C"], lambda x: ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) + + # mixture + res = df.loc[["A", "C"], lambda x: "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) + + res = df.loc[["A", "C"], lambda x: ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) + + res = df.loc[lambda x: ["A", "C"], "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) + + res = df.loc[lambda x: ["A", "C"], ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) + + def test_frame_loc_setitem_callable(self): + # GH#11485 + df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + + # return label + res = df.copy() + res.loc[lambda x: ["A", "C"]] = -20 + exp = df.copy() + exp.loc[["A", "C"]] = -20 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], :] = 20 + exp = df.copy() + exp.loc[["A", "C"], :] = 20 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], lambda x: "X"] = -1 + exp = df.copy() + exp.loc[["A", "C"], "X"] = -1 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], lambda x: ["X"]] = [5, 10] + exp = df.copy() + exp.loc[["A", "C"], ["X"]] = [5, 10] + tm.assert_frame_equal(res, exp) + + # mixture + res = df.copy() + res.loc[["A", "C"], lambda x: "X"] = np.array([-1, -2]) + exp = df.copy() + exp.loc[["A", "C"], "X"] = np.array([-1, -2]) + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[["A", "C"], lambda x: ["X"]] = 10 + exp = df.copy() + exp.loc[["A", "C"], ["X"]] = 10 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], "X"] = -2 + exp = df.copy() + exp.loc[["A", "C"], "X"] = -2 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], ["X"]] = -4 + exp = df.copy() + exp.loc[["A", "C"], ["X"]] = -4 + tm.assert_frame_equal(res, exp) + + def test_series_loc_getitem_label_list_missing_values(): # gh-11428 key = np.array( diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 6e35b224ef4c3..dba4b9214e50c 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -676,6 +676,11 @@ class TestTableOrientReader: {"floats": [1.0, 2.0, 3.0, 4.0]}, {"floats": [1.1, 2.2, 3.3, 4.4]}, {"bools": [True, False, False, True]}, + { + "timezones": pd.date_range( + "2016-01-01", freq="d", periods=4, tz="US/Central" + ) # added in # GH 35973 + }, ], ) @pytest.mark.skipif(sys.version_info[:3] == (3, 7, 0), reason="GH-35309") @@ -686,22 +691,59 @@ def test_read_json_table_orient(self, index_nm, vals, recwarn): tm.assert_frame_equal(df, result) @pytest.mark.parametrize("index_nm", [None, "idx", "index"]) + @pytest.mark.parametrize( + "vals", + [{"timedeltas": pd.timedelta_range("1H", periods=4, freq="T")}], + ) + def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): + df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) + out = df.to_json(orient="table") + with pytest.raises(NotImplementedError, match="can not yet read "): + pd.read_json(out, orient="table") + + @pytest.mark.parametrize( + "idx", + [ + pd.Index(range(4)), + pd.Index( + pd.date_range( + "2020-08-30", + freq="d", + periods=4, + ), + freq=None, + ), + pd.Index( + pd.date_range("2020-08-30", freq="d", periods=4, tz="US/Central"), + freq=None, + ), + pd.MultiIndex.from_product( + [ + pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"), + ["x", "y"], + ], + ), + ], + ) @pytest.mark.parametrize( "vals", [ - {"timedeltas": pd.timedelta_range("1H", periods=4, freq="T")}, + {"floats": [1.1, 2.2, 3.3, 4.4]}, + {"dates": pd.date_range("2020-08-30", freq="d", periods=4)}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2020-08-30", freq="d", periods=4, tz="Europe/London" ) }, ], ) - def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): - df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) + @pytest.mark.skipif(sys.version_info[:3] == (3, 7, 0), reason="GH-35309") + def test_read_json_table_timezones_orient(self, idx, vals, recwarn): + # GH 35973 + df = DataFrame(vals, index=idx) out = df.to_json(orient="table") - with pytest.raises(NotImplementedError, match="can not yet read "): - pd.read_json(out, orient="table") + result = pd.read_json(out, orient="table") + tm.assert_frame_equal(df, result) def test_comprehensive(self): df = DataFrame( diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 933bdc462e3f8..2e68d3306c7d1 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -143,7 +143,7 @@ def test_readjson_chunks_closes(chunksize): ) reader.read() assert ( - reader.open_stream.closed + reader.handles.handle.closed ), f"didn't close stream with chunksize = {chunksize}" diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b33289213e258..8f63d06859f62 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -6,7 +6,7 @@ import csv from datetime import datetime from inspect import signature -from io import StringIO +from io import BytesIO, StringIO import os import platform from urllib.error import URLError @@ -2253,3 +2253,60 @@ def test_dict_keys_as_names(all_parsers): result = parser.read_csv(StringIO(data), names=keys) expected = DataFrame({"a": [1], "b": [2]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("io_class", [StringIO, BytesIO]) +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_read_csv_file_handle(all_parsers, io_class, encoding): + """ + Test whether read_csv does not close user-provided file handles. + + GH 36980 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + content = "a,b\n1,2" + if io_class == BytesIO: + content = content.encode("utf-8") + handle = io_class(content) + + tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) + assert not handle.closed + + +def test_memory_map_file_handle_silent_fallback(all_parsers, compression): + """ + Do not fail for buffers with memory_map=True (cannot memory map BytesIO). + + GH 37621 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + handle = BytesIO() + expected.to_csv(handle, index=False, compression=compression, mode="wb") + handle.seek(0) + + tm.assert_frame_equal( + parser.read_csv(handle, memory_map=True, compression=compression), + expected, + ) + + +def test_memory_map_compression(all_parsers, compression): + """ + Support memory map for compressed files. + + GH 37621 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + with tm.ensure_clean() as path: + expected.to_csv(path, index=False, compression=compression) + + tm.assert_frame_equal( + parser.read_csv(path, memory_map=True, compression=compression), + expected, + ) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 876696ecdad9c..e74265da3e966 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -152,14 +152,17 @@ def test_binary_mode_file_buffers( with open(fpath, mode="r", encoding=encoding) as fa: result = parser.read_csv(fa) + assert not fa.closed tm.assert_frame_equal(expected, result) with open(fpath, mode="rb") as fb: result = parser.read_csv(fb, encoding=encoding) + assert not fb.closed tm.assert_frame_equal(expected, result) with open(fpath, mode="rb", buffering=0) as fb: result = parser.read_csv(fb, encoding=encoding) + assert not fb.closed tm.assert_frame_equal(expected, result) @@ -199,6 +202,7 @@ def test_encoding_named_temp_file(all_parsers): result = parser.read_csv(f, encoding=encoding) tm.assert_frame_equal(result, expected) + assert not f.closed @pytest.mark.parametrize( diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 1c2518646bb29..413b78a52ad38 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -31,13 +31,10 @@ def test_file_handle(self): reader = TextReader(f) reader.read() - def test_string_filename(self): - reader = TextReader(self.csv1, header=None) - reader.read() - def test_file_handle_mmap(self): + # this was never using memory_map=True with open(self.csv1, "rb") as f: - reader = TextReader(f, memory_map=True, header=None) + reader = TextReader(f, header=None) reader.read() def test_StringIO(self): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index f37b0aabd3aed..d76a5a6f64055 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1253,17 +1253,32 @@ def test_append_all_nans(self, setup_path): store.append("df2", df[10:], dropna=False) tm.assert_frame_equal(store["df2"], df) - # Test to make sure defaults are to not drop. - # Corresponding to Issue 9382 + def test_store_dropna(self, setup_path): df_with_missing = DataFrame( - {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} + {"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]}, + index=list("abc"), ) + df_without_missing = DataFrame( + {"col1": [0.0, 2.0], "col2": [1.0, np.nan]}, index=list("ac") + ) + + # # Test to make sure defaults are to not drop. + # # Corresponding to Issue 9382 + with ensure_clean_path(setup_path) as path: + df_with_missing.to_hdf(path, "df", format="table") + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_with_missing, reloaded) with ensure_clean_path(setup_path) as path: - df_with_missing.to_hdf(path, "df_with_missing", format="table") - reloaded = read_hdf(path, "df_with_missing") + df_with_missing.to_hdf(path, "df", format="table", dropna=False) + reloaded = read_hdf(path, "df") tm.assert_frame_equal(df_with_missing, reloaded) + with ensure_clean_path(setup_path) as path: + df_with_missing.to_hdf(path, "df", format="table", dropna=True) + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_without_missing, reloaded) + def test_read_missing_key_close_store(self, setup_path): # GH 25766 with ensure_clean_path(setup_path) as path: diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 31e9ad4cf4416..8d7d5d85cbb48 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -47,18 +47,18 @@ def test_compression_size(obj, method, compression_only): @pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - f, handles = icom.get_handle(path, "w", compression=compression_only) - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed + handles = icom.get_handle(path, "w", compression=compression_only) + getattr(obj, method)(handles.handle) + assert not handles.handle.closed + handles.close() + assert handles.handle.closed compressed_size = os.path.getsize(path) with tm.ensure_clean() as path: - f, handles = icom.get_handle(path, "w", compression=None) - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed + handles = icom.get_handle(path, "w", compression=None) + getattr(obj, method)(handles.handle) + assert not handles.handle.closed + handles.close() + assert handles.handle.closed uncompressed_size = os.path.getsize(path) assert uncompressed_size > compressed_size @@ -111,10 +111,10 @@ def test_compression_warning(compression_only): columns=["X", "Y", "Z"], ) with tm.ensure_clean() as path: - f, handles = icom.get_handle(path, "w", compression=compression_only) + handles = icom.get_handle(path, "w", compression=compression_only) with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): - with f: - df.to_csv(f, compression=compression_only) + df.to_csv(handles.handle, compression=compression_only) + handles.close() def test_compression_binary(compression_only): diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py index 95d6dcbaf3baf..f3f09d7a42204 100644 --- a/pandas/tests/libs/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -135,9 +135,14 @@ def test_cython_inner_join(self): tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) -def test_left_join_indexer_unique(): +@pytest.mark.parametrize("readonly", [True, False]) +def test_left_join_indexer_unique(readonly): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([2, 2, 3, 4, 4], dtype=np.int64) + if readonly: + # GH#37312, GH#37264 + a.setflags(write=False) + b.setflags(write=False) result = libjoin.left_join_indexer_unique(b, a) expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) diff --git a/pandas/tests/plotting/frame/__init__.py b/pandas/tests/plotting/frame/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/frame/test_frame.py similarity index 61% rename from pandas/tests/plotting/test_frame.py rename to pandas/tests/plotting/frame/test_frame.py index e6a61d35365a3..9aab765dca96b 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -167,74 +167,6 @@ def test_integer_array_plot(self): _check_plot_works(df.plot.scatter, x="x", y="y") _check_plot_works(df.plot.hexbin, x="x", y="y") - def test_mpl2_color_cycle_str(self): - # GH 15516 - df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) - colors = ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9"] - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always", "MatplotlibDeprecationWarning") - - for color in colors: - _check_plot_works(df.plot, color=color) - - # if warning is raised, check that it is the exact problematic one - # GH 36972 - if w: - match = "Support for uppercase single-letter colors is deprecated" - warning_message = str(w[0].message) - msg = "MatplotlibDeprecationWarning related to CN colors was raised" - assert match not in warning_message, msg - - def test_color_single_series_list(self): - # GH 3486 - df = DataFrame({"A": [1, 2, 3]}) - _check_plot_works(df.plot, color=["red"]) - - def test_rgb_tuple_color(self): - # GH 16695 - df = DataFrame({"x": [1, 2], "y": [3, 4]}) - _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0)) - _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0, 0.5)) - - def test_color_empty_string(self): - df = DataFrame(np.random.randn(10, 2)) - with pytest.raises(ValueError): - df.plot(color="") - - def test_color_and_style_arguments(self): - df = DataFrame({"x": [1, 2], "y": [3, 4]}) - # passing both 'color' and 'style' arguments should be allowed - # if there is no color symbol in the style strings: - ax = df.plot(color=["red", "black"], style=["-", "--"]) - # check that the linestyles are correctly set: - linestyle = [line.get_linestyle() for line in ax.lines] - assert linestyle == ["-", "--"] - # check that the colors are correctly set: - color = [line.get_color() for line in ax.lines] - assert color == ["red", "black"] - # passing both 'color' and 'style' arguments should not be allowed - # if there is a color symbol in the style strings: - with pytest.raises(ValueError): - df.plot(color=["red", "black"], style=["k-", "r--"]) - - @pytest.mark.parametrize( - "color, expected", - [ - ("green", ["green"] * 4), - (["yellow", "red", "green", "blue"], ["yellow", "red", "green", "blue"]), - ], - ) - def test_color_and_marker(self, color, expected): - # GH 21003 - df = DataFrame(np.random.random((7, 4))) - ax = df.plot(color=color, style="d--") - # check colors - result = [i.get_color() for i in ax.lines] - assert result == expected - # check markers and linestyles - assert all(i.get_linestyle() == "--" for i in ax.lines) - assert all(i.get_marker() == "d" for i in ax.lines) - def test_nonnumeric_exclude(self): df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}) ax = df.plot() @@ -404,412 +336,6 @@ def test_unsorted_index_lims(self): assert xmin <= np.nanmin(lines[0].get_data()[0]) assert xmax >= np.nanmax(lines[0].get_data()[0]) - @pytest.mark.slow - def test_subplots(self): - df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - - for kind in ["bar", "barh", "line", "area"]: - axes = df.plot(kind=kind, subplots=True, sharex=True, legend=True) - self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) - assert axes.shape == (3,) - - for ax, column in zip(axes, df.columns): - self._check_legend_labels(ax, labels=[pprint_thing(column)]) - - for ax in axes[:-2]: - self._check_visible(ax.xaxis) # xaxis must be visible for grid - self._check_visible(ax.get_xticklabels(), visible=False) - if not (kind == "bar" and self.mpl_ge_3_1_0): - # change https://github.com/pandas-dev/pandas/issues/26714 - self._check_visible(ax.get_xticklabels(minor=True), visible=False) - self._check_visible(ax.xaxis.get_label(), visible=False) - self._check_visible(ax.get_yticklabels()) - - self._check_visible(axes[-1].xaxis) - self._check_visible(axes[-1].get_xticklabels()) - self._check_visible(axes[-1].get_xticklabels(minor=True)) - self._check_visible(axes[-1].xaxis.get_label()) - self._check_visible(axes[-1].get_yticklabels()) - - axes = df.plot(kind=kind, subplots=True, sharex=False) - for ax in axes: - self._check_visible(ax.xaxis) - self._check_visible(ax.get_xticklabels()) - self._check_visible(ax.get_xticklabels(minor=True)) - self._check_visible(ax.xaxis.get_label()) - self._check_visible(ax.get_yticklabels()) - - axes = df.plot(kind=kind, subplots=True, legend=False) - for ax in axes: - assert ax.get_legend() is None - - def test_groupby_boxplot_sharey(self): - # https://github.com/pandas-dev/pandas/issues/20968 - # sharey can now be switched check whether the right - # pair of axes is turned on or off - - df = DataFrame( - { - "a": [-1.43, -0.15, -3.70, -1.43, -0.14], - "b": [0.56, 0.84, 0.29, 0.56, 0.85], - "c": [0, 1, 2, 3, 1], - }, - index=[0, 1, 2, 3, 4], - ) - - # behavior without keyword - axes = df.groupby("c").boxplot() - expected = [True, False, True, False] - self._assert_ytickslabels_visibility(axes, expected) - - # set sharey=True should be identical - axes = df.groupby("c").boxplot(sharey=True) - expected = [True, False, True, False] - self._assert_ytickslabels_visibility(axes, expected) - - # sharey=False, all yticklabels should be visible - axes = df.groupby("c").boxplot(sharey=False) - expected = [True, True, True, True] - self._assert_ytickslabels_visibility(axes, expected) - - def test_groupby_boxplot_sharex(self): - # https://github.com/pandas-dev/pandas/issues/20968 - # sharex can now be switched check whether the right - # pair of axes is turned on or off - - df = DataFrame( - { - "a": [-1.43, -0.15, -3.70, -1.43, -0.14], - "b": [0.56, 0.84, 0.29, 0.56, 0.85], - "c": [0, 1, 2, 3, 1], - }, - index=[0, 1, 2, 3, 4], - ) - - # behavior without keyword - axes = df.groupby("c").boxplot() - expected = [True, True, True, True] - self._assert_xtickslabels_visibility(axes, expected) - - # set sharex=False should be identical - axes = df.groupby("c").boxplot(sharex=False) - expected = [True, True, True, True] - self._assert_xtickslabels_visibility(axes, expected) - - # sharex=True, yticklabels should be visible - # only for bottom plots - axes = df.groupby("c").boxplot(sharex=True) - expected = [False, False, True, True] - self._assert_xtickslabels_visibility(axes, expected) - - @pytest.mark.slow - def test_subplots_timeseries(self): - idx = date_range(start="2014-07-01", freq="M", periods=10) - df = DataFrame(np.random.rand(10, 3), index=idx) - - for kind in ["line", "area"]: - axes = df.plot(kind=kind, subplots=True, sharex=True) - self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) - - for ax in axes[:-2]: - # GH 7801 - self._check_visible(ax.xaxis) # xaxis must be visible for grid - self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible(ax.get_xticklabels(minor=True), visible=False) - self._check_visible(ax.xaxis.get_label(), visible=False) - self._check_visible(ax.get_yticklabels()) - - self._check_visible(axes[-1].xaxis) - self._check_visible(axes[-1].get_xticklabels()) - self._check_visible(axes[-1].get_xticklabels(minor=True)) - self._check_visible(axes[-1].xaxis.get_label()) - self._check_visible(axes[-1].get_yticklabels()) - self._check_ticks_props(axes, xrot=0) - - axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) - for ax in axes: - self._check_visible(ax.xaxis) - self._check_visible(ax.get_xticklabels()) - self._check_visible(ax.get_xticklabels(minor=True)) - self._check_visible(ax.xaxis.get_label()) - self._check_visible(ax.get_yticklabels()) - self._check_ticks_props(ax, xlabelsize=7, xrot=45, ylabelsize=7) - - def test_subplots_timeseries_y_axis(self): - # GH16953 - data = { - "numeric": np.array([1, 2, 5]), - "timedelta": [ - pd.Timedelta(-10, unit="s"), - pd.Timedelta(10, unit="m"), - pd.Timedelta(10, unit="h"), - ], - "datetime_no_tz": [ - pd.to_datetime("2017-08-01 00:00:00"), - pd.to_datetime("2017-08-01 02:00:00"), - pd.to_datetime("2017-08-02 00:00:00"), - ], - "datetime_all_tz": [ - pd.to_datetime("2017-08-01 00:00:00", utc=True), - pd.to_datetime("2017-08-01 02:00:00", utc=True), - pd.to_datetime("2017-08-02 00:00:00", utc=True), - ], - "text": ["This", "should", "fail"], - } - testdata = DataFrame(data) - - ax_numeric = testdata.plot(y="numeric") - assert ( - ax_numeric.get_lines()[0].get_data()[1] == testdata["numeric"].values - ).all() - ax_timedelta = testdata.plot(y="timedelta") - assert ( - ax_timedelta.get_lines()[0].get_data()[1] == testdata["timedelta"].values - ).all() - ax_datetime_no_tz = testdata.plot(y="datetime_no_tz") - assert ( - ax_datetime_no_tz.get_lines()[0].get_data()[1] - == testdata["datetime_no_tz"].values - ).all() - ax_datetime_all_tz = testdata.plot(y="datetime_all_tz") - assert ( - ax_datetime_all_tz.get_lines()[0].get_data()[1] - == testdata["datetime_all_tz"].values - ).all() - - msg = "no numeric data to plot" - with pytest.raises(TypeError, match=msg): - testdata.plot(y="text") - - @pytest.mark.xfail(reason="not support for period, categorical, datetime_mixed_tz") - def test_subplots_timeseries_y_axis_not_supported(self): - """ - This test will fail for: - period: - since period isn't yet implemented in ``select_dtypes`` - and because it will need a custom value converter + - tick formatter (as was done for x-axis plots) - - categorical: - because it will need a custom value converter + - tick formatter (also doesn't work for x-axis, as of now) - - datetime_mixed_tz: - because of the way how pandas handles ``Series`` of - ``datetime`` objects with different timezone, - generally converting ``datetime`` objects in a tz-aware - form could help with this problem - """ - data = { - "numeric": np.array([1, 2, 5]), - "period": [ - pd.Period("2017-08-01 00:00:00", freq="H"), - pd.Period("2017-08-01 02:00", freq="H"), - pd.Period("2017-08-02 00:00:00", freq="H"), - ], - "categorical": pd.Categorical( - ["c", "b", "a"], categories=["a", "b", "c"], ordered=False - ), - "datetime_mixed_tz": [ - pd.to_datetime("2017-08-01 00:00:00", utc=True), - pd.to_datetime("2017-08-01 02:00:00"), - pd.to_datetime("2017-08-02 00:00:00"), - ], - } - testdata = DataFrame(data) - ax_period = testdata.plot(x="numeric", y="period") - assert ( - ax_period.get_lines()[0].get_data()[1] == testdata["period"].values - ).all() - ax_categorical = testdata.plot(x="numeric", y="categorical") - assert ( - ax_categorical.get_lines()[0].get_data()[1] - == testdata["categorical"].values - ).all() - ax_datetime_mixed_tz = testdata.plot(x="numeric", y="datetime_mixed_tz") - assert ( - ax_datetime_mixed_tz.get_lines()[0].get_data()[1] - == testdata["datetime_mixed_tz"].values - ).all() - - @pytest.mark.slow - def test_subplots_layout(self): - # GH 6667 - df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - - axes = df.plot(subplots=True, layout=(2, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - assert axes.shape == (2, 2) - - axes = df.plot(subplots=True, layout=(-1, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - assert axes.shape == (2, 2) - - axes = df.plot(subplots=True, layout=(2, -1)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - assert axes.shape == (2, 2) - - axes = df.plot(subplots=True, layout=(1, 4)) - self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) - assert axes.shape == (1, 4) - - axes = df.plot(subplots=True, layout=(-1, 4)) - self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) - assert axes.shape == (1, 4) - - axes = df.plot(subplots=True, layout=(4, -1)) - self._check_axes_shape(axes, axes_num=3, layout=(4, 1)) - assert axes.shape == (4, 1) - - with pytest.raises(ValueError): - df.plot(subplots=True, layout=(1, 1)) - with pytest.raises(ValueError): - df.plot(subplots=True, layout=(-1, -1)) - - # single column - df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) - axes = df.plot(subplots=True) - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - assert axes.shape == (1,) - - axes = df.plot(subplots=True, layout=(3, 3)) - self._check_axes_shape(axes, axes_num=1, layout=(3, 3)) - assert axes.shape == (3, 3) - - @pytest.mark.slow - def test_subplots_warnings(self): - # GH 9464 - with tm.assert_produces_warning(None): - df = DataFrame(np.random.randn(100, 4)) - df.plot(subplots=True, layout=(3, 2)) - - df = DataFrame( - np.random.randn(100, 4), index=date_range("1/1/2000", periods=100) - ) - df.plot(subplots=True, layout=(3, 2)) - - @pytest.mark.slow - def test_subplots_multiple_axes(self): - # GH 5353, 6970, GH 7069 - fig, axes = self.plt.subplots(2, 3) - df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - - returned = df.plot(subplots=True, ax=axes[0], sharex=False, sharey=False) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - assert returned.shape == (3,) - assert returned[0].figure is fig - # draw on second row - returned = df.plot(subplots=True, ax=axes[1], sharex=False, sharey=False) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - assert returned.shape == (3,) - assert returned[0].figure is fig - self._check_axes_shape(axes, axes_num=6, layout=(2, 3)) - tm.close() - - with pytest.raises(ValueError): - fig, axes = self.plt.subplots(2, 3) - # pass different number of axes from required - df.plot(subplots=True, ax=axes) - - # pass 2-dim axes and invalid layout - # invalid lauout should not affect to input and return value - # (show warning is tested in - # TestDataFrameGroupByPlots.test_grouped_box_multiple_axes - fig, axes = self.plt.subplots(2, 2) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - df = DataFrame(np.random.rand(10, 4), index=list(string.ascii_letters[:10])) - - returned = df.plot( - subplots=True, ax=axes, layout=(2, 1), sharex=False, sharey=False - ) - self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4,) - - returned = df.plot( - subplots=True, ax=axes, layout=(2, -1), sharex=False, sharey=False - ) - self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4,) - - returned = df.plot( - subplots=True, ax=axes, layout=(-1, 2), sharex=False, sharey=False - ) - self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4,) - - # single column - fig, axes = self.plt.subplots(1, 1) - df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) - - axes = df.plot(subplots=True, ax=[axes], sharex=False, sharey=False) - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - assert axes.shape == (1,) - - def test_subplots_ts_share_axes(self): - # GH 3964 - fig, axes = self.plt.subplots(3, 3, sharex=True, sharey=True) - self.plt.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3) - df = DataFrame( - np.random.randn(10, 9), - index=date_range(start="2014-07-01", freq="M", periods=10), - ) - for i, ax in enumerate(axes.ravel()): - df[i].plot(ax=ax, fontsize=5) - - # Rows other than bottom should not be visible - for ax in axes[0:-1].ravel(): - self._check_visible(ax.get_xticklabels(), visible=False) - - # Bottom row should be visible - for ax in axes[-1].ravel(): - self._check_visible(ax.get_xticklabels(), visible=True) - - # First column should be visible - for ax in axes[[0, 1, 2], [0]].ravel(): - self._check_visible(ax.get_yticklabels(), visible=True) - - # Other columns should not be visible - for ax in axes[[0, 1, 2], [1]].ravel(): - self._check_visible(ax.get_yticklabels(), visible=False) - for ax in axes[[0, 1, 2], [2]].ravel(): - self._check_visible(ax.get_yticklabels(), visible=False) - - def test_subplots_sharex_axes_existing_axes(self): - # GH 9158 - d = {"A": [1.0, 2.0, 3.0, 4.0], "B": [4.0, 3.0, 2.0, 1.0], "C": [5, 1, 3, 4]} - df = DataFrame(d, index=date_range("2014 10 11", "2014 10 14")) - - axes = df[["A", "B"]].plot(subplots=True) - df["C"].plot(ax=axes[0], secondary_y=True) - - self._check_visible(axes[0].get_xticklabels(), visible=False) - self._check_visible(axes[1].get_xticklabels(), visible=True) - for ax in axes.ravel(): - self._check_visible(ax.get_yticklabels(), visible=True) - - @pytest.mark.slow - def test_subplots_dup_columns(self): - # GH 10962 - df = DataFrame(np.random.rand(5, 5), columns=list("aaaaa")) - axes = df.plot(subplots=True) - for ax in axes: - self._check_legend_labels(ax, labels=["a"]) - assert len(ax.lines) == 1 - tm.close() - - axes = df.plot(subplots=True, secondary_y="a") - for ax in axes: - # (right) is only attached when subplots=False - self._check_legend_labels(ax, labels=["a"]) - assert len(ax.lines) == 1 - tm.close() - - ax = df.plot(secondary_y="a") - self._check_legend_labels(ax, labels=["a (right)"] * 5) - assert len(ax.lines) == 0 - assert len(ax.right_ax.lines) == 5 - def test_negative_log(self): df = -DataFrame( np.random.rand(6, 4), @@ -952,60 +478,6 @@ def test_area_lim(self): ymin, ymax = ax.get_ylim() assert ymax == 0 - @pytest.mark.slow - def test_bar_colors(self): - import matplotlib.pyplot as plt - - default_colors = self._unpack_cycler(plt.rcParams) - - df = DataFrame(np.random.randn(5, 5)) - ax = df.plot.bar() - self._check_colors(ax.patches[::5], facecolors=default_colors[:5]) - tm.close() - - custom_colors = "rgcby" - ax = df.plot.bar(color=custom_colors) - self._check_colors(ax.patches[::5], facecolors=custom_colors) - tm.close() - - from matplotlib import cm - - # Test str -> colormap functionality - ax = df.plot.bar(colormap="jet") - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] - self._check_colors(ax.patches[::5], facecolors=rgba_colors) - tm.close() - - # Test colormap functionality - ax = df.plot.bar(colormap=cm.jet) - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] - self._check_colors(ax.patches[::5], facecolors=rgba_colors) - tm.close() - - ax = df.loc[:, [0]].plot.bar(color="DodgerBlue") - self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) - tm.close() - - ax = df.plot(kind="bar", color="green") - self._check_colors(ax.patches[::5], facecolors=["green"] * 5) - tm.close() - - def test_bar_user_colors(self): - df = DataFrame( - {"A": range(4), "B": range(1, 5), "color": ["red", "blue", "blue", "red"]} - ) - # This should *only* work when `y` is specified, else - # we use one color per column - ax = df.plot.bar(y="A", color=df["color"]) - result = [p.get_facecolor() for p in ax.patches] - expected = [ - (1.0, 0.0, 0.0, 1.0), - (0.0, 0.0, 1.0, 1.0), - (0.0, 0.0, 1.0, 1.0), - (1.0, 0.0, 0.0, 1.0), - ] - assert result == expected - @pytest.mark.slow def test_bar_linewidth(self): df = DataFrame(np.random.randn(5, 5)) @@ -1065,46 +537,6 @@ def test_bar_barwidth(self): for r in ax.patches: assert r.get_height() == width - @pytest.mark.slow - def test_bar_barwidth_position(self): - df = DataFrame(np.random.randn(5, 5)) - self._check_bar_alignment( - df, kind="bar", stacked=False, width=0.9, position=0.2 - ) - self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, position=0.2) - self._check_bar_alignment( - df, kind="barh", stacked=False, width=0.9, position=0.2 - ) - self._check_bar_alignment( - df, kind="barh", stacked=True, width=0.9, position=0.2 - ) - self._check_bar_alignment( - df, kind="bar", subplots=True, width=0.9, position=0.2 - ) - self._check_bar_alignment( - df, kind="barh", subplots=True, width=0.9, position=0.2 - ) - - @pytest.mark.slow - def test_bar_barwidth_position_int(self): - # GH 12979 - df = DataFrame(np.random.randn(5, 5)) - - for w in [1, 1.0]: - ax = df.plot.bar(stacked=True, width=w) - ticks = ax.xaxis.get_ticklocs() - tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4])) - assert ax.get_xlim() == (-0.75, 4.75) - # check left-edge of bars - assert ax.patches[0].get_x() == -0.5 - assert ax.patches[-1].get_x() == 3.5 - - self._check_bar_alignment(df, kind="bar", stacked=True, width=1) - self._check_bar_alignment(df, kind="barh", stacked=False, width=1) - self._check_bar_alignment(df, kind="barh", stacked=True, width=1) - self._check_bar_alignment(df, kind="bar", subplots=True, width=1) - self._check_bar_alignment(df, kind="barh", subplots=True, width=1) - @pytest.mark.slow def test_bar_bottom_left(self): df = DataFrame(np.random.rand(5, 5)) @@ -1230,29 +662,6 @@ def test_scatterplot_object_data(self): _check_plot_works(df.plot.scatter, x="a", y="b") _check_plot_works(df.plot.scatter, x=0, y=1) - @pytest.mark.slow - def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): - # addressing issue #10611, to ensure colobar does not - # interfere with x-axis label and ticklabels with - # ipython inline backend. - random_array = np.random.random((1000, 3)) - df = DataFrame(random_array, columns=["A label", "B label", "C label"]) - - ax1 = df.plot.scatter(x="A label", y="B label") - ax2 = df.plot.scatter(x="A label", y="B label", c="C label") - - vis1 = [vis.get_visible() for vis in ax1.xaxis.get_minorticklabels()] - vis2 = [vis.get_visible() for vis in ax2.xaxis.get_minorticklabels()] - assert vis1 == vis2 - - vis1 = [vis.get_visible() for vis in ax1.xaxis.get_majorticklabels()] - vis2 = [vis.get_visible() for vis in ax2.xaxis.get_majorticklabels()] - assert vis1 == vis2 - - assert ( - ax1.xaxis.get_label().get_visible() == ax2.xaxis.get_label().get_visible() - ) - @pytest.mark.slow def test_if_hexbin_xaxis_label_is_visible(self): # addressing issue #10678, to ensure colobar does not @@ -1266,24 +675,6 @@ def test_if_hexbin_xaxis_label_is_visible(self): assert all(vis.get_visible() for vis in ax.xaxis.get_majorticklabels()) assert ax.xaxis.get_label().get_visible() - @pytest.mark.slow - def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): - import matplotlib.pyplot as plt - - random_array = np.random.random((1000, 3)) - df = DataFrame(random_array, columns=["A label", "B label", "C label"]) - - fig, axes = plt.subplots(1, 2) - df.plot.scatter("A label", "B label", c="C label", ax=axes[0]) - df.plot.scatter("A label", "B label", c="C label", ax=axes[1]) - plt.tight_layout() - - points = np.array([ax.get_position().get_points() for ax in fig.axes]) - axes_x_coords = points[:, :, 0] - parent_distance = axes_x_coords[1, :] - axes_x_coords[0, :] - colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] - assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() - @pytest.mark.parametrize("x, y", [("x", "y"), ("y", "x"), ("y", "y")]) @pytest.mark.slow def test_plot_scatter_with_categorical_data(self, x, y): @@ -1344,17 +735,6 @@ def test_plot_scatter_with_c(self): float_array = np.array([0.0, 1.0]) df.plot.scatter(x="A", y="B", c=float_array, cmap="spring") - @pytest.mark.parametrize("cmap", [None, "Greys"]) - def test_scatter_with_c_column_name_with_colors(self, cmap): - # https://github.com/pandas-dev/pandas/issues/34316 - df = DataFrame( - [[5.1, 3.5], [4.9, 3.0], [7.0, 3.2], [6.4, 3.2], [5.9, 3.0]], - columns=["length", "width"], - ) - df["species"] = ["r", "r", "g", "g", "b"] - ax = df.plot.scatter(x=0, y=1, c="species", cmap=cmap) - assert ax.collections[0].colorbar is None - def test_plot_scatter_with_s(self): # this refers to GH 32904 df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"]) @@ -1362,39 +742,6 @@ def test_plot_scatter_with_s(self): ax = df.plot.scatter(x="a", y="b", s="c") tm.assert_numpy_array_equal(df["c"].values, right=ax.collections[0].get_sizes()) - def test_scatter_colors(self): - df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) - with pytest.raises(TypeError): - df.plot.scatter(x="a", y="b", c="c", color="green") - - default_colors = self._unpack_cycler(self.plt.rcParams) - - ax = df.plot.scatter(x="a", y="b", c="c") - tm.assert_numpy_array_equal( - ax.collections[0].get_facecolor()[0], - np.array(self.colorconverter.to_rgba(default_colors[0])), - ) - - ax = df.plot.scatter(x="a", y="b", color="white") - tm.assert_numpy_array_equal( - ax.collections[0].get_facecolor()[0], - np.array([1, 1, 1, 1], dtype=np.float64), - ) - - def test_scatter_colorbar_different_cmap(self): - # GH 33389 - import matplotlib.pyplot as plt - - df = DataFrame({"x": [1, 2, 3], "y": [1, 3, 2], "c": [1, 2, 3]}) - df["x2"] = df["x"] + 1 - - fig, ax = plt.subplots() - df.plot("x", "y", c="c", kind="scatter", cmap="cividis", ax=ax) - df.plot("x2", "y", c="c", kind="scatter", cmap="magma", ax=ax) - - assert ax.collections[0].cmap.name == "cividis" - assert ax.collections[1].cmap.name == "magma" - @pytest.mark.slow def test_plot_bar(self): df = DataFrame( @@ -1430,164 +777,6 @@ def test_plot_bar(self): ax = df.plot.barh(rot=55, fontsize=11) self._check_ticks_props(ax, yrot=55, ylabelsize=11, xlabelsize=11) - def _check_bar_alignment( - self, - df, - kind="bar", - stacked=False, - subplots=False, - align="center", - width=0.5, - position=0.5, - ): - - axes = df.plot( - kind=kind, - stacked=stacked, - subplots=subplots, - align=align, - width=width, - position=position, - grid=True, - ) - - axes = self._flatten_visible(axes) - - for ax in axes: - if kind == "bar": - axis = ax.xaxis - ax_min, ax_max = ax.get_xlim() - min_edge = min(p.get_x() for p in ax.patches) - max_edge = max(p.get_x() + p.get_width() for p in ax.patches) - elif kind == "barh": - axis = ax.yaxis - ax_min, ax_max = ax.get_ylim() - min_edge = min(p.get_y() for p in ax.patches) - max_edge = max(p.get_y() + p.get_height() for p in ax.patches) - else: - raise ValueError - - # GH 7498 - # compare margins between lim and bar edges - tm.assert_almost_equal(ax_min, min_edge - 0.25) - tm.assert_almost_equal(ax_max, max_edge + 0.25) - - p = ax.patches[0] - if kind == "bar" and (stacked is True or subplots is True): - edge = p.get_x() - center = edge + p.get_width() * position - elif kind == "bar" and stacked is False: - center = p.get_x() + p.get_width() * len(df.columns) * position - edge = p.get_x() - elif kind == "barh" and (stacked is True or subplots is True): - center = p.get_y() + p.get_height() * position - edge = p.get_y() - elif kind == "barh" and stacked is False: - center = p.get_y() + p.get_height() * len(df.columns) * position - edge = p.get_y() - else: - raise ValueError - - # Check the ticks locates on integer - assert (axis.get_ticklocs() == np.arange(len(df))).all() - - if align == "center": - # Check whether the bar locates on center - tm.assert_almost_equal(axis.get_ticklocs()[0], center) - elif align == "edge": - # Check whether the bar's edge starts from the tick - tm.assert_almost_equal(axis.get_ticklocs()[0], edge) - else: - raise ValueError - - return axes - - @pytest.mark.slow - def test_bar_stacked_center(self): - # GH2157 - df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind="bar", stacked=True) - self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9) - self._check_bar_alignment(df, kind="barh", stacked=True) - self._check_bar_alignment(df, kind="barh", stacked=True, width=0.9) - - @pytest.mark.slow - def test_bar_center(self): - df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind="bar", stacked=False) - self._check_bar_alignment(df, kind="bar", stacked=False, width=0.9) - self._check_bar_alignment(df, kind="barh", stacked=False) - self._check_bar_alignment(df, kind="barh", stacked=False, width=0.9) - - @pytest.mark.slow - def test_bar_subplots_center(self): - df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind="bar", subplots=True) - self._check_bar_alignment(df, kind="bar", subplots=True, width=0.9) - self._check_bar_alignment(df, kind="barh", subplots=True) - self._check_bar_alignment(df, kind="barh", subplots=True, width=0.9) - - @pytest.mark.slow - def test_bar_align_single_column(self): - df = DataFrame(np.random.randn(5)) - self._check_bar_alignment(df, kind="bar", stacked=False) - self._check_bar_alignment(df, kind="bar", stacked=True) - self._check_bar_alignment(df, kind="barh", stacked=False) - self._check_bar_alignment(df, kind="barh", stacked=True) - self._check_bar_alignment(df, kind="bar", subplots=True) - self._check_bar_alignment(df, kind="barh", subplots=True) - - @pytest.mark.slow - def test_bar_edge(self): - df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) - - self._check_bar_alignment(df, kind="bar", stacked=True, align="edge") - self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, align="edge") - self._check_bar_alignment(df, kind="barh", stacked=True, align="edge") - self._check_bar_alignment( - df, kind="barh", stacked=True, width=0.9, align="edge" - ) - - self._check_bar_alignment(df, kind="bar", stacked=False, align="edge") - self._check_bar_alignment( - df, kind="bar", stacked=False, width=0.9, align="edge" - ) - self._check_bar_alignment(df, kind="barh", stacked=False, align="edge") - self._check_bar_alignment( - df, kind="barh", stacked=False, width=0.9, align="edge" - ) - - self._check_bar_alignment(df, kind="bar", subplots=True, align="edge") - self._check_bar_alignment( - df, kind="bar", subplots=True, width=0.9, align="edge" - ) - self._check_bar_alignment(df, kind="barh", subplots=True, align="edge") - self._check_bar_alignment( - df, kind="barh", subplots=True, width=0.9, align="edge" - ) - - @pytest.mark.slow - def test_bar_log_no_subplots(self): - # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 - # regressions in 1.2.1 - expected = np.array([0.1, 1.0, 10.0, 100]) - - # no subplots - df = DataFrame({"A": [3] * 5, "B": list(range(1, 6))}, index=range(5)) - ax = df.plot.bar(grid=True, log=True) - tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) - - @pytest.mark.slow - def test_bar_log_subplots(self): - expected = np.array([0.1, 1.0, 10.0, 100.0, 1000.0, 1e4]) - - ax = DataFrame([Series([200, 300]), Series([300, 500])]).plot.bar( - log=True, subplots=True - ) - - tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected) - tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected) - @pytest.mark.slow def test_boxplot(self): df = self.hist_df @@ -1666,26 +855,6 @@ def test_boxplot_return_type(self): result = df.plot.box(return_type="both") self._check_box_return_type(result, "both") - @pytest.mark.slow - def test_boxplot_subplots_return_type(self): - df = self.hist_df - - # normal style: return_type=None - result = df.plot.box(subplots=True) - assert isinstance(result, Series) - self._check_box_return_type( - result, None, expected_keys=["height", "weight", "category"] - ) - - for t in ["dict", "axes", "both"]: - returned = df.plot.box(return_type=t, subplots=True) - self._check_box_return_type( - returned, - t, - expected_keys=["height", "weight", "category"], - check_ax_title=False, - ) - @pytest.mark.slow @td.skip_if_no_scipy def test_kde_df(self): @@ -2079,353 +1248,6 @@ def test_line_label_none(self): ax = s.plot(legend=True) assert ax.get_legend().get_texts()[0].get_text() == "None" - @pytest.mark.slow - def test_line_colors(self): - from matplotlib import cm - - custom_colors = "rgcby" - df = DataFrame(np.random.randn(5, 5)) - - ax = df.plot(color=custom_colors) - self._check_colors(ax.get_lines(), linecolors=custom_colors) - - tm.close() - - ax2 = df.plot(color=custom_colors) - lines2 = ax2.get_lines() - - for l1, l2 in zip(ax.get_lines(), lines2): - assert l1.get_color() == l2.get_color() - - tm.close() - - ax = df.plot(colormap="jet") - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=rgba_colors) - tm.close() - - ax = df.plot(colormap=cm.jet) - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=rgba_colors) - tm.close() - - # make color a list if plotting one column frame - # handles cases like df.plot(color='DodgerBlue') - ax = df.loc[:, [0]].plot(color="DodgerBlue") - self._check_colors(ax.lines, linecolors=["DodgerBlue"]) - - ax = df.plot(color="red") - self._check_colors(ax.get_lines(), linecolors=["red"] * 5) - tm.close() - - # GH 10299 - custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] - ax = df.plot(color=custom_colors) - self._check_colors(ax.get_lines(), linecolors=custom_colors) - tm.close() - - @pytest.mark.slow - def test_dont_modify_colors(self): - colors = ["r", "g", "b"] - DataFrame(np.random.rand(10, 2)).plot(color=colors) - assert len(colors) == 3 - - @pytest.mark.slow - def test_line_colors_and_styles_subplots(self): - # GH 9894 - from matplotlib import cm - - default_colors = self._unpack_cycler(self.plt.rcParams) - - df = DataFrame(np.random.randn(5, 5)) - - axes = df.plot(subplots=True) - for ax, c in zip(axes, list(default_colors)): - c = [c] - self._check_colors(ax.get_lines(), linecolors=c) - tm.close() - - # single color char - axes = df.plot(subplots=True, color="k") - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["k"]) - tm.close() - - # single color str - axes = df.plot(subplots=True, color="green") - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["green"]) - tm.close() - - custom_colors = "rgcby" - axes = df.plot(color=custom_colors, subplots=True) - for ax, c in zip(axes, list(custom_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - axes = df.plot(color=list(custom_colors), subplots=True) - for ax, c in zip(axes, list(custom_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - # GH 10299 - custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] - axes = df.plot(color=custom_colors, subplots=True) - for ax, c in zip(axes, list(custom_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - for cmap in ["jet", cm.jet]: - axes = df.plot(colormap=cmap, subplots=True) - for ax, c in zip(axes, rgba_colors): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - # make color a list if plotting one column frame - # handles cases like df.plot(color='DodgerBlue') - axes = df.loc[:, [0]].plot(color="DodgerBlue", subplots=True) - self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) - - # single character style - axes = df.plot(style="r", subplots=True) - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["r"]) - tm.close() - - # list of styles - styles = list("rgcby") - axes = df.plot(style=styles, subplots=True) - for ax, c in zip(axes, styles): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - @pytest.mark.slow - def test_area_colors(self): - from matplotlib import cm - from matplotlib.collections import PolyCollection - - custom_colors = "rgcby" - df = DataFrame(np.random.rand(5, 5)) - - ax = df.plot.area(color=custom_colors) - self._check_colors(ax.get_lines(), linecolors=custom_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] - self._check_colors(poly, facecolors=custom_colors) - - handles, labels = ax.get_legend_handles_labels() - self._check_colors(handles, facecolors=custom_colors) - - for h in handles: - assert h.get_alpha() is None - tm.close() - - ax = df.plot.area(colormap="jet") - jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] - self._check_colors(poly, facecolors=jet_colors) - - handles, labels = ax.get_legend_handles_labels() - self._check_colors(handles, facecolors=jet_colors) - for h in handles: - assert h.get_alpha() is None - tm.close() - - # When stacked=False, alpha is set to 0.5 - ax = df.plot.area(colormap=cm.jet, stacked=False) - self._check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] - jet_with_alpha = [(c[0], c[1], c[2], 0.5) for c in jet_colors] - self._check_colors(poly, facecolors=jet_with_alpha) - - handles, labels = ax.get_legend_handles_labels() - linecolors = jet_with_alpha - self._check_colors(handles[: len(jet_colors)], linecolors=linecolors) - for h in handles: - assert h.get_alpha() == 0.5 - - @pytest.mark.slow - def test_hist_colors(self): - default_colors = self._unpack_cycler(self.plt.rcParams) - - df = DataFrame(np.random.randn(5, 5)) - ax = df.plot.hist() - self._check_colors(ax.patches[::10], facecolors=default_colors[:5]) - tm.close() - - custom_colors = "rgcby" - ax = df.plot.hist(color=custom_colors) - self._check_colors(ax.patches[::10], facecolors=custom_colors) - tm.close() - - from matplotlib import cm - - # Test str -> colormap functionality - ax = df.plot.hist(colormap="jet") - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] - self._check_colors(ax.patches[::10], facecolors=rgba_colors) - tm.close() - - # Test colormap functionality - ax = df.plot.hist(colormap=cm.jet) - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] - self._check_colors(ax.patches[::10], facecolors=rgba_colors) - tm.close() - - ax = df.loc[:, [0]].plot.hist(color="DodgerBlue") - self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) - - ax = df.plot(kind="hist", color="green") - self._check_colors(ax.patches[::10], facecolors=["green"] * 5) - tm.close() - - @pytest.mark.slow - @td.skip_if_no_scipy - def test_kde_colors(self): - from matplotlib import cm - - custom_colors = "rgcby" - df = DataFrame(np.random.rand(5, 5)) - - ax = df.plot.kde(color=custom_colors) - self._check_colors(ax.get_lines(), linecolors=custom_colors) - tm.close() - - ax = df.plot.kde(colormap="jet") - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=rgba_colors) - tm.close() - - ax = df.plot.kde(colormap=cm.jet) - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=rgba_colors) - - @pytest.mark.slow - @td.skip_if_no_scipy - def test_kde_colors_and_styles_subplots(self): - from matplotlib import cm - - default_colors = self._unpack_cycler(self.plt.rcParams) - - df = DataFrame(np.random.randn(5, 5)) - - axes = df.plot(kind="kde", subplots=True) - for ax, c in zip(axes, list(default_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - # single color char - axes = df.plot(kind="kde", color="k", subplots=True) - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["k"]) - tm.close() - - # single color str - axes = df.plot(kind="kde", color="red", subplots=True) - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["red"]) - tm.close() - - custom_colors = "rgcby" - axes = df.plot(kind="kde", color=custom_colors, subplots=True) - for ax, c in zip(axes, list(custom_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - for cmap in ["jet", cm.jet]: - axes = df.plot(kind="kde", colormap=cmap, subplots=True) - for ax, c in zip(axes, rgba_colors): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - # make color a list if plotting one column frame - # handles cases like df.plot(color='DodgerBlue') - axes = df.loc[:, [0]].plot(kind="kde", color="DodgerBlue", subplots=True) - self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) - - # single character style - axes = df.plot(kind="kde", style="r", subplots=True) - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["r"]) - tm.close() - - # list of styles - styles = list("rgcby") - axes = df.plot(kind="kde", style=styles, subplots=True) - for ax, c in zip(axes, styles): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - @pytest.mark.slow - def test_boxplot_colors(self): - def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): - # TODO: outside this func? - if fliers_c is None: - fliers_c = "k" - self._check_colors(bp["boxes"], linecolors=[box_c] * len(bp["boxes"])) - self._check_colors( - bp["whiskers"], linecolors=[whiskers_c] * len(bp["whiskers"]) - ) - self._check_colors( - bp["medians"], linecolors=[medians_c] * len(bp["medians"]) - ) - self._check_colors(bp["fliers"], linecolors=[fliers_c] * len(bp["fliers"])) - self._check_colors(bp["caps"], linecolors=[caps_c] * len(bp["caps"])) - - default_colors = self._unpack_cycler(self.plt.rcParams) - - df = DataFrame(np.random.randn(5, 5)) - bp = df.plot.box(return_type="dict") - _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) - tm.close() - - dict_colors = dict( - boxes="#572923", whiskers="#982042", medians="#804823", caps="#123456" - ) - bp = df.plot.box(color=dict_colors, sym="r+", return_type="dict") - _check_colors( - bp, - dict_colors["boxes"], - dict_colors["whiskers"], - dict_colors["medians"], - dict_colors["caps"], - "r", - ) - tm.close() - - # partial colors - dict_colors = dict(whiskers="c", medians="m") - bp = df.plot.box(color=dict_colors, return_type="dict") - _check_colors(bp, default_colors[0], "c", "m") - tm.close() - - from matplotlib import cm - - # Test str -> colormap functionality - bp = df.plot.box(colormap="jet", return_type="dict") - jet_colors = [cm.jet(n) for n in np.linspace(0, 1, 3)] - _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) - tm.close() - - # Test colormap functionality - bp = df.plot.box(colormap=cm.jet, return_type="dict") - _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) - tm.close() - - # string color is applied to all artists except fliers - bp = df.plot.box(color="DodgerBlue", return_type="dict") - _check_colors(bp, "DodgerBlue", "DodgerBlue", "DodgerBlue", "DodgerBlue") - - # tuple is also applied to all artists except fliers - bp = df.plot.box(color=(0, 1, 0), sym="#123456", return_type="dict") - _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), "#123456") - - with pytest.raises(ValueError): - # Color contains invalid key results in ValueError - df.plot.box(color=dict(boxes="red", xxxx="blue")) - @pytest.mark.parametrize( "props, expected", [ @@ -2443,19 +1265,6 @@ def test_specified_props_kwd_plot_box(self, props, expected): assert result[expected][0].get_color() == "C1" - def test_default_color_cycle(self): - import cycler - import matplotlib.pyplot as plt - - colors = list("rgbk") - plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors) - - df = DataFrame(np.random.randn(5, 3)) - ax = df.plot() - - expected = self._unpack_cycler(plt.rcParams)[:3] - self._check_colors(ax.get_lines(), linecolors=expected) - def test_unordered_ts(self): df = DataFrame( np.array([3.0, 2.0, 1.0]), @@ -2595,13 +1404,6 @@ def test_hexbin_cmap(self): ax = df.plot.hexbin(x="A", y="B", colormap=cm) assert ax.collections[0].cmap.name == cm - @pytest.mark.slow - def test_no_color_bar(self): - df = self.hexbin_df - - ax = df.plot.hexbin(x="A", y="B", colorbar=None) - assert ax.collections[0].colorbar is None - @pytest.mark.slow def test_allow_cmap(self): df = self.hexbin_df @@ -3041,53 +1843,6 @@ def test_memory_leak(self): # need to actually access something to get an error results[key].lines - @pytest.mark.slow - def test_df_subplots_patterns_minorticks(self): - # GH 10657 - import matplotlib.pyplot as plt - - df = DataFrame( - np.random.randn(10, 2), - index=date_range("1/1/2000", periods=10), - columns=list("AB"), - ) - - # shared subplots - fig, axes = plt.subplots(2, 1, sharex=True) - axes = df.plot(subplots=True, ax=axes) - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_yticklabels(), visible=True) - # xaxis of 1st ax must be hidden - self._check_visible(axes[0].get_xticklabels(), visible=False) - self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) - self._check_visible(axes[1].get_xticklabels(), visible=True) - self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) - tm.close() - - fig, axes = plt.subplots(2, 1) - with tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=axes, sharex=True) - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_yticklabels(), visible=True) - # xaxis of 1st ax must be hidden - self._check_visible(axes[0].get_xticklabels(), visible=False) - self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) - self._check_visible(axes[1].get_xticklabels(), visible=True) - self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) - tm.close() - - # not shared - fig, axes = plt.subplots(2, 1) - axes = df.plot(subplots=True, ax=axes) - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_yticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - tm.close() - @pytest.mark.slow def test_df_gridspec_patterns(self): # GH 10819 @@ -3213,12 +1968,6 @@ def test_df_grid_settings(self): kws={"x": "a", "y": "b"}, ) - def test_invalid_colormap(self): - df = DataFrame(np.random.randn(3, 2), columns=["A", "B"]) - - with pytest.raises(ValueError): - df.plot(colormap="invalid_colormap") - def test_plain_axes(self): # supplied ax itself is a SubplotAxes, but figure contains also @@ -3250,22 +1999,6 @@ def test_plain_axes(self): Series(np.random.rand(10)).plot(ax=ax) Series(np.random.rand(10)).plot(ax=iax) - def test_passed_bar_colors(self): - import matplotlib as mpl - - color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] - colormap = mpl.colors.ListedColormap(color_tuples) - barplot = DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) - assert color_tuples == [c.get_facecolor() for c in barplot.patches] - - def test_rcParams_bar_colors(self): - import matplotlib as mpl - - color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] - with mpl.rc_context(rc={"axes.prop_cycle": mpl.cycler("color", color_tuples)}): - barplot = DataFrame([[1, 2, 3]]).plot(kind="bar") - assert color_tuples == [c.get_facecolor() for c in barplot.patches] - @pytest.mark.parametrize("method", ["line", "barh", "bar"]) def test_secondary_axis_font_size(self, method): # GH: 12565 @@ -3354,22 +2087,6 @@ def test_xlim_plot_line_correctly_in_mixed_plot_type(self): xticklabels = [t.get_text() for t in ax.get_xticklabels()] assert xticklabels == indexes - def test_subplots_sharex_false(self): - # test when sharex is set to False, two plots should have different - # labels, GH 25160 - df = DataFrame(np.random.rand(10, 2)) - df.iloc[5:, 1] = np.nan - df.iloc[:5, 0] = np.nan - - figs, axs = self.plt.subplots(2, 1) - df.plot.line(ax=axs, subplots=True, sharex=False) - - expected_ax1 = np.arange(4.5, 10, 0.5) - expected_ax2 = np.arange(-0.5, 5, 0.5) - - tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1) - tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) - def test_plot_no_rows(self): # GH 27758 df = DataFrame(columns=["foo"], dtype=int) @@ -3413,16 +2130,6 @@ def test_missing_markers_legend_using_style(self): self._check_legend_labels(ax, labels=["A", "B", "C"]) self._check_legend_marker(ax, expected_markers=[".", ".", "."]) - def test_colors_of_columns_with_same_name(self): - # ISSUE 11136 -> https://github.com/pandas-dev/pandas/issues/11136 - # Creating a DataFrame with duplicate column labels and testing colors of them. - df = DataFrame({"b": [0, 1, 0], "a": [1, 2, 3]}) - df1 = DataFrame({"a": [2, 4, 6]}) - df_concat = pd.concat([df, df1], axis=1) - result = df_concat.plot() - for legend, line in zip(result.get_legend().legendHandles, result.lines): - assert legend.get_color() == line.get_color() - @pytest.mark.parametrize( "index_name, old_label, new_label", [ @@ -3472,34 +2179,6 @@ def test_xlabel_ylabel_dataframe_plane_plot(self, kind, xlabel, ylabel): assert ax.get_xlabel() == (xcol if xlabel is None else xlabel) assert ax.get_ylabel() == (ycol if ylabel is None else ylabel) - @pytest.mark.parametrize( - "index_name, old_label, new_label", - [ - (None, "", "new"), - ("old", "old", "new"), - (None, "", ""), - (None, "", 1), - (None, "", [1, 2]), - ], - ) - @pytest.mark.parametrize("kind", ["line", "area", "bar"]) - def test_xlabel_ylabel_dataframe_subplots( - self, kind, index_name, old_label, new_label - ): - # GH 9093 - df = DataFrame([[1, 2], [2, 5]], columns=["Type A", "Type B"]) - df.index.name = index_name - - # default is the ylabel is not shown and xlabel is index name - axes = df.plot(kind=kind, subplots=True) - assert all(ax.get_ylabel() == "" for ax in axes) - assert all(ax.get_xlabel() == old_label for ax in axes) - - # old xlabel will be overriden and assigned ylabel will be used as ylabel - axes = df.plot(kind=kind, ylabel=new_label, xlabel=new_label, subplots=True) - assert all(ax.get_ylabel() == str(new_label) for ax in axes) - assert all(ax.get_xlabel() == str(new_label) for ax in axes) - def _generate_4_axes_via_gridspec(): import matplotlib as mpl diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py new file mode 100644 index 0000000000000..2d509e8f3b320 --- /dev/null +++ b/pandas/tests/plotting/frame/test_frame_color.py @@ -0,0 +1,645 @@ +""" Test cases for DataFrame.plot """ + +import warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.plotting.common import TestPlotBase, _check_plot_works + + +@td.skip_if_no_mpl +class TestDataFramePlotsColor(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + + self.tdf = tm.makeTimeDataFrame() + self.hexbin_df = DataFrame( + { + "A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20), + } + ) + + def _assert_ytickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_yticklabels(), visible=exp) + + def _assert_xtickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_xticklabels(), visible=exp) + + def test_mpl2_color_cycle_str(self): + # GH 15516 + df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) + colors = ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9"] + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always", "MatplotlibDeprecationWarning") + + for color in colors: + _check_plot_works(df.plot, color=color) + + # if warning is raised, check that it is the exact problematic one + # GH 36972 + if w: + match = "Support for uppercase single-letter colors is deprecated" + warning_message = str(w[0].message) + msg = "MatplotlibDeprecationWarning related to CN colors was raised" + assert match not in warning_message, msg + + def test_color_single_series_list(self): + # GH 3486 + df = DataFrame({"A": [1, 2, 3]}) + _check_plot_works(df.plot, color=["red"]) + + def test_rgb_tuple_color(self): + # GH 16695 + df = DataFrame({"x": [1, 2], "y": [3, 4]}) + _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0)) + _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0, 0.5)) + + def test_color_empty_string(self): + df = DataFrame(np.random.randn(10, 2)) + with pytest.raises(ValueError): + df.plot(color="") + + def test_color_and_style_arguments(self): + df = DataFrame({"x": [1, 2], "y": [3, 4]}) + # passing both 'color' and 'style' arguments should be allowed + # if there is no color symbol in the style strings: + ax = df.plot(color=["red", "black"], style=["-", "--"]) + # check that the linestyles are correctly set: + linestyle = [line.get_linestyle() for line in ax.lines] + assert linestyle == ["-", "--"] + # check that the colors are correctly set: + color = [line.get_color() for line in ax.lines] + assert color == ["red", "black"] + # passing both 'color' and 'style' arguments should not be allowed + # if there is a color symbol in the style strings: + with pytest.raises(ValueError): + df.plot(color=["red", "black"], style=["k-", "r--"]) + + @pytest.mark.parametrize( + "color, expected", + [ + ("green", ["green"] * 4), + (["yellow", "red", "green", "blue"], ["yellow", "red", "green", "blue"]), + ], + ) + def test_color_and_marker(self, color, expected): + # GH 21003 + df = DataFrame(np.random.random((7, 4))) + ax = df.plot(color=color, style="d--") + # check colors + result = [i.get_color() for i in ax.lines] + assert result == expected + # check markers and linestyles + assert all(i.get_linestyle() == "--" for i in ax.lines) + assert all(i.get_marker() == "d" for i in ax.lines) + + @pytest.mark.slow + def test_bar_colors(self): + import matplotlib.pyplot as plt + + default_colors = self._unpack_cycler(plt.rcParams) + + df = DataFrame(np.random.randn(5, 5)) + ax = df.plot.bar() + self._check_colors(ax.patches[::5], facecolors=default_colors[:5]) + tm.close() + + custom_colors = "rgcby" + ax = df.plot.bar(color=custom_colors) + self._check_colors(ax.patches[::5], facecolors=custom_colors) + tm.close() + + from matplotlib import cm + + # Test str -> colormap functionality + ax = df.plot.bar(colormap="jet") + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] + self._check_colors(ax.patches[::5], facecolors=rgba_colors) + tm.close() + + # Test colormap functionality + ax = df.plot.bar(colormap=cm.jet) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] + self._check_colors(ax.patches[::5], facecolors=rgba_colors) + tm.close() + + ax = df.loc[:, [0]].plot.bar(color="DodgerBlue") + self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) + tm.close() + + ax = df.plot(kind="bar", color="green") + self._check_colors(ax.patches[::5], facecolors=["green"] * 5) + tm.close() + + def test_bar_user_colors(self): + df = DataFrame( + {"A": range(4), "B": range(1, 5), "color": ["red", "blue", "blue", "red"]} + ) + # This should *only* work when `y` is specified, else + # we use one color per column + ax = df.plot.bar(y="A", color=df["color"]) + result = [p.get_facecolor() for p in ax.patches] + expected = [ + (1.0, 0.0, 0.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (1.0, 0.0, 0.0, 1.0), + ] + assert result == expected + + @pytest.mark.slow + def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): + # addressing issue #10611, to ensure colobar does not + # interfere with x-axis label and ticklabels with + # ipython inline backend. + random_array = np.random.random((1000, 3)) + df = DataFrame(random_array, columns=["A label", "B label", "C label"]) + + ax1 = df.plot.scatter(x="A label", y="B label") + ax2 = df.plot.scatter(x="A label", y="B label", c="C label") + + vis1 = [vis.get_visible() for vis in ax1.xaxis.get_minorticklabels()] + vis2 = [vis.get_visible() for vis in ax2.xaxis.get_minorticklabels()] + assert vis1 == vis2 + + vis1 = [vis.get_visible() for vis in ax1.xaxis.get_majorticklabels()] + vis2 = [vis.get_visible() for vis in ax2.xaxis.get_majorticklabels()] + assert vis1 == vis2 + + assert ( + ax1.xaxis.get_label().get_visible() == ax2.xaxis.get_label().get_visible() + ) + + @pytest.mark.slow + def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): + import matplotlib.pyplot as plt + + random_array = np.random.random((1000, 3)) + df = DataFrame(random_array, columns=["A label", "B label", "C label"]) + + fig, axes = plt.subplots(1, 2) + df.plot.scatter("A label", "B label", c="C label", ax=axes[0]) + df.plot.scatter("A label", "B label", c="C label", ax=axes[1]) + plt.tight_layout() + + points = np.array([ax.get_position().get_points() for ax in fig.axes]) + axes_x_coords = points[:, :, 0] + parent_distance = axes_x_coords[1, :] - axes_x_coords[0, :] + colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] + assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() + + @pytest.mark.parametrize("cmap", [None, "Greys"]) + def test_scatter_with_c_column_name_with_colors(self, cmap): + # https://github.com/pandas-dev/pandas/issues/34316 + df = DataFrame( + [[5.1, 3.5], [4.9, 3.0], [7.0, 3.2], [6.4, 3.2], [5.9, 3.0]], + columns=["length", "width"], + ) + df["species"] = ["r", "r", "g", "g", "b"] + ax = df.plot.scatter(x=0, y=1, c="species", cmap=cmap) + assert ax.collections[0].colorbar is None + + def test_scatter_colors(self): + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) + with pytest.raises(TypeError): + df.plot.scatter(x="a", y="b", c="c", color="green") + + default_colors = self._unpack_cycler(self.plt.rcParams) + + ax = df.plot.scatter(x="a", y="b", c="c") + tm.assert_numpy_array_equal( + ax.collections[0].get_facecolor()[0], + np.array(self.colorconverter.to_rgba(default_colors[0])), + ) + + ax = df.plot.scatter(x="a", y="b", color="white") + tm.assert_numpy_array_equal( + ax.collections[0].get_facecolor()[0], + np.array([1, 1, 1, 1], dtype=np.float64), + ) + + def test_scatter_colorbar_different_cmap(self): + # GH 33389 + import matplotlib.pyplot as plt + + df = DataFrame({"x": [1, 2, 3], "y": [1, 3, 2], "c": [1, 2, 3]}) + df["x2"] = df["x"] + 1 + + fig, ax = plt.subplots() + df.plot("x", "y", c="c", kind="scatter", cmap="cividis", ax=ax) + df.plot("x2", "y", c="c", kind="scatter", cmap="magma", ax=ax) + + assert ax.collections[0].cmap.name == "cividis" + assert ax.collections[1].cmap.name == "magma" + + @pytest.mark.slow + def test_line_colors(self): + from matplotlib import cm + + custom_colors = "rgcby" + df = DataFrame(np.random.randn(5, 5)) + + ax = df.plot(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + + tm.close() + + ax2 = df.plot(color=custom_colors) + lines2 = ax2.get_lines() + + for l1, l2 in zip(ax.get_lines(), lines2): + assert l1.get_color() == l2.get_color() + + tm.close() + + ax = df.plot(colormap="jet") + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + tm.close() + + ax = df.plot(colormap=cm.jet) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + tm.close() + + # make color a list if plotting one column frame + # handles cases like df.plot(color='DodgerBlue') + ax = df.loc[:, [0]].plot(color="DodgerBlue") + self._check_colors(ax.lines, linecolors=["DodgerBlue"]) + + ax = df.plot(color="red") + self._check_colors(ax.get_lines(), linecolors=["red"] * 5) + tm.close() + + # GH 10299 + custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] + ax = df.plot(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + tm.close() + + @pytest.mark.slow + def test_dont_modify_colors(self): + colors = ["r", "g", "b"] + DataFrame(np.random.rand(10, 2)).plot(color=colors) + assert len(colors) == 3 + + @pytest.mark.slow + def test_line_colors_and_styles_subplots(self): + # GH 9894 + from matplotlib import cm + + default_colors = self._unpack_cycler(self.plt.rcParams) + + df = DataFrame(np.random.randn(5, 5)) + + axes = df.plot(subplots=True) + for ax, c in zip(axes, list(default_colors)): + c = [c] + self._check_colors(ax.get_lines(), linecolors=c) + tm.close() + + # single color char + axes = df.plot(subplots=True, color="k") + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["k"]) + tm.close() + + # single color str + axes = df.plot(subplots=True, color="green") + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["green"]) + tm.close() + + custom_colors = "rgcby" + axes = df.plot(color=custom_colors, subplots=True) + for ax, c in zip(axes, list(custom_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + axes = df.plot(color=list(custom_colors), subplots=True) + for ax, c in zip(axes, list(custom_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + # GH 10299 + custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] + axes = df.plot(color=custom_colors, subplots=True) + for ax, c in zip(axes, list(custom_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + for cmap in ["jet", cm.jet]: + axes = df.plot(colormap=cmap, subplots=True) + for ax, c in zip(axes, rgba_colors): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + # make color a list if plotting one column frame + # handles cases like df.plot(color='DodgerBlue') + axes = df.loc[:, [0]].plot(color="DodgerBlue", subplots=True) + self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) + + # single character style + axes = df.plot(style="r", subplots=True) + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["r"]) + tm.close() + + # list of styles + styles = list("rgcby") + axes = df.plot(style=styles, subplots=True) + for ax, c in zip(axes, styles): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + @pytest.mark.slow + def test_area_colors(self): + from matplotlib import cm + from matplotlib.collections import PolyCollection + + custom_colors = "rgcby" + df = DataFrame(np.random.rand(5, 5)) + + ax = df.plot.area(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + self._check_colors(poly, facecolors=custom_colors) + + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, facecolors=custom_colors) + + for h in handles: + assert h.get_alpha() is None + tm.close() + + ax = df.plot.area(colormap="jet") + jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=jet_colors) + poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + self._check_colors(poly, facecolors=jet_colors) + + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, facecolors=jet_colors) + for h in handles: + assert h.get_alpha() is None + tm.close() + + # When stacked=False, alpha is set to 0.5 + ax = df.plot.area(colormap=cm.jet, stacked=False) + self._check_colors(ax.get_lines(), linecolors=jet_colors) + poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + jet_with_alpha = [(c[0], c[1], c[2], 0.5) for c in jet_colors] + self._check_colors(poly, facecolors=jet_with_alpha) + + handles, labels = ax.get_legend_handles_labels() + linecolors = jet_with_alpha + self._check_colors(handles[: len(jet_colors)], linecolors=linecolors) + for h in handles: + assert h.get_alpha() == 0.5 + + @pytest.mark.slow + def test_hist_colors(self): + default_colors = self._unpack_cycler(self.plt.rcParams) + + df = DataFrame(np.random.randn(5, 5)) + ax = df.plot.hist() + self._check_colors(ax.patches[::10], facecolors=default_colors[:5]) + tm.close() + + custom_colors = "rgcby" + ax = df.plot.hist(color=custom_colors) + self._check_colors(ax.patches[::10], facecolors=custom_colors) + tm.close() + + from matplotlib import cm + + # Test str -> colormap functionality + ax = df.plot.hist(colormap="jet") + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] + self._check_colors(ax.patches[::10], facecolors=rgba_colors) + tm.close() + + # Test colormap functionality + ax = df.plot.hist(colormap=cm.jet) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] + self._check_colors(ax.patches[::10], facecolors=rgba_colors) + tm.close() + + ax = df.loc[:, [0]].plot.hist(color="DodgerBlue") + self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) + + ax = df.plot(kind="hist", color="green") + self._check_colors(ax.patches[::10], facecolors=["green"] * 5) + tm.close() + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_kde_colors(self): + from matplotlib import cm + + custom_colors = "rgcby" + df = DataFrame(np.random.rand(5, 5)) + + ax = df.plot.kde(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + tm.close() + + ax = df.plot.kde(colormap="jet") + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + tm.close() + + ax = df.plot.kde(colormap=cm.jet) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_kde_colors_and_styles_subplots(self): + from matplotlib import cm + + default_colors = self._unpack_cycler(self.plt.rcParams) + + df = DataFrame(np.random.randn(5, 5)) + + axes = df.plot(kind="kde", subplots=True) + for ax, c in zip(axes, list(default_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + # single color char + axes = df.plot(kind="kde", color="k", subplots=True) + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["k"]) + tm.close() + + # single color str + axes = df.plot(kind="kde", color="red", subplots=True) + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["red"]) + tm.close() + + custom_colors = "rgcby" + axes = df.plot(kind="kde", color=custom_colors, subplots=True) + for ax, c in zip(axes, list(custom_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + for cmap in ["jet", cm.jet]: + axes = df.plot(kind="kde", colormap=cmap, subplots=True) + for ax, c in zip(axes, rgba_colors): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + # make color a list if plotting one column frame + # handles cases like df.plot(color='DodgerBlue') + axes = df.loc[:, [0]].plot(kind="kde", color="DodgerBlue", subplots=True) + self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) + + # single character style + axes = df.plot(kind="kde", style="r", subplots=True) + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["r"]) + tm.close() + + # list of styles + styles = list("rgcby") + axes = df.plot(kind="kde", style=styles, subplots=True) + for ax, c in zip(axes, styles): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + @pytest.mark.slow + def test_boxplot_colors(self): + def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): + # TODO: outside this func? + if fliers_c is None: + fliers_c = "k" + self._check_colors(bp["boxes"], linecolors=[box_c] * len(bp["boxes"])) + self._check_colors( + bp["whiskers"], linecolors=[whiskers_c] * len(bp["whiskers"]) + ) + self._check_colors( + bp["medians"], linecolors=[medians_c] * len(bp["medians"]) + ) + self._check_colors(bp["fliers"], linecolors=[fliers_c] * len(bp["fliers"])) + self._check_colors(bp["caps"], linecolors=[caps_c] * len(bp["caps"])) + + default_colors = self._unpack_cycler(self.plt.rcParams) + + df = DataFrame(np.random.randn(5, 5)) + bp = df.plot.box(return_type="dict") + _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) + tm.close() + + dict_colors = dict( + boxes="#572923", whiskers="#982042", medians="#804823", caps="#123456" + ) + bp = df.plot.box(color=dict_colors, sym="r+", return_type="dict") + _check_colors( + bp, + dict_colors["boxes"], + dict_colors["whiskers"], + dict_colors["medians"], + dict_colors["caps"], + "r", + ) + tm.close() + + # partial colors + dict_colors = dict(whiskers="c", medians="m") + bp = df.plot.box(color=dict_colors, return_type="dict") + _check_colors(bp, default_colors[0], "c", "m") + tm.close() + + from matplotlib import cm + + # Test str -> colormap functionality + bp = df.plot.box(colormap="jet", return_type="dict") + jet_colors = [cm.jet(n) for n in np.linspace(0, 1, 3)] + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + tm.close() + + # Test colormap functionality + bp = df.plot.box(colormap=cm.jet, return_type="dict") + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + tm.close() + + # string color is applied to all artists except fliers + bp = df.plot.box(color="DodgerBlue", return_type="dict") + _check_colors(bp, "DodgerBlue", "DodgerBlue", "DodgerBlue", "DodgerBlue") + + # tuple is also applied to all artists except fliers + bp = df.plot.box(color=(0, 1, 0), sym="#123456", return_type="dict") + _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), "#123456") + + with pytest.raises(ValueError): + # Color contains invalid key results in ValueError + df.plot.box(color=dict(boxes="red", xxxx="blue")) + + def test_default_color_cycle(self): + import cycler + import matplotlib.pyplot as plt + + colors = list("rgbk") + plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors) + + df = DataFrame(np.random.randn(5, 3)) + ax = df.plot() + + expected = self._unpack_cycler(plt.rcParams)[:3] + self._check_colors(ax.get_lines(), linecolors=expected) + + @pytest.mark.slow + def test_no_color_bar(self): + df = self.hexbin_df + + ax = df.plot.hexbin(x="A", y="B", colorbar=None) + assert ax.collections[0].colorbar is None + + def test_invalid_colormap(self): + df = DataFrame(np.random.randn(3, 2), columns=["A", "B"]) + + with pytest.raises(ValueError): + df.plot(colormap="invalid_colormap") + + def test_passed_bar_colors(self): + import matplotlib as mpl + + color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] + colormap = mpl.colors.ListedColormap(color_tuples) + barplot = DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) + assert color_tuples == [c.get_facecolor() for c in barplot.patches] + + def test_rcParams_bar_colors(self): + import matplotlib as mpl + + color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] + with mpl.rc_context(rc={"axes.prop_cycle": mpl.cycler("color", color_tuples)}): + barplot = DataFrame([[1, 2, 3]]).plot(kind="bar") + assert color_tuples == [c.get_facecolor() for c in barplot.patches] + + def test_colors_of_columns_with_same_name(self): + # ISSUE 11136 -> https://github.com/pandas-dev/pandas/issues/11136 + # Creating a DataFrame with duplicate column labels and testing colors of them. + df = DataFrame({"b": [0, 1, 0], "a": [1, 2, 3]}) + df1 = DataFrame({"a": [2, 4, 6]}) + df_concat = pd.concat([df, df1], axis=1) + result = df_concat.plot() + for legend, line in zip(result.get_legend().legendHandles, result.lines): + assert legend.get_color() == line.get_color() diff --git a/pandas/tests/plotting/frame/test_frame_groupby.py b/pandas/tests/plotting/frame/test_frame_groupby.py new file mode 100644 index 0000000000000..92ae025145595 --- /dev/null +++ b/pandas/tests/plotting/frame/test_frame_groupby.py @@ -0,0 +1,95 @@ +""" Test cases for DataFrame.plot """ + +import numpy as np + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.plotting.common import TestPlotBase + + +@td.skip_if_no_mpl +class TestDataFramePlotsGroupby(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + + self.tdf = tm.makeTimeDataFrame() + self.hexbin_df = DataFrame( + { + "A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20), + } + ) + + def _assert_ytickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_yticklabels(), visible=exp) + + def _assert_xtickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_xticklabels(), visible=exp) + + def test_groupby_boxplot_sharey(self): + # https://github.com/pandas-dev/pandas/issues/20968 + # sharey can now be switched check whether the right + # pair of axes is turned on or off + + df = DataFrame( + { + "a": [-1.43, -0.15, -3.70, -1.43, -0.14], + "b": [0.56, 0.84, 0.29, 0.56, 0.85], + "c": [0, 1, 2, 3, 1], + }, + index=[0, 1, 2, 3, 4], + ) + + # behavior without keyword + axes = df.groupby("c").boxplot() + expected = [True, False, True, False] + self._assert_ytickslabels_visibility(axes, expected) + + # set sharey=True should be identical + axes = df.groupby("c").boxplot(sharey=True) + expected = [True, False, True, False] + self._assert_ytickslabels_visibility(axes, expected) + + # sharey=False, all yticklabels should be visible + axes = df.groupby("c").boxplot(sharey=False) + expected = [True, True, True, True] + self._assert_ytickslabels_visibility(axes, expected) + + def test_groupby_boxplot_sharex(self): + # https://github.com/pandas-dev/pandas/issues/20968 + # sharex can now be switched check whether the right + # pair of axes is turned on or off + + df = DataFrame( + { + "a": [-1.43, -0.15, -3.70, -1.43, -0.14], + "b": [0.56, 0.84, 0.29, 0.56, 0.85], + "c": [0, 1, 2, 3, 1], + }, + index=[0, 1, 2, 3, 4], + ) + + # behavior without keyword + axes = df.groupby("c").boxplot() + expected = [True, True, True, True] + self._assert_xtickslabels_visibility(axes, expected) + + # set sharex=False should be identical + axes = df.groupby("c").boxplot(sharex=False) + expected = [True, True, True, True] + self._assert_xtickslabels_visibility(axes, expected) + + # sharex=True, yticklabels should be visible + # only for bottom plots + axes = df.groupby("c").boxplot(sharex=True) + expected = [False, False, True, True] + self._assert_xtickslabels_visibility(axes, expected) diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py new file mode 100644 index 0000000000000..4c86a570360b0 --- /dev/null +++ b/pandas/tests/plotting/frame/test_frame_subplots.py @@ -0,0 +1,698 @@ +""" Test cases for DataFrame.plot """ + +import string +import warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series, date_range +import pandas._testing as tm +from pandas.tests.plotting.common import TestPlotBase + +from pandas.io.formats.printing import pprint_thing + + +@td.skip_if_no_mpl +class TestDataFramePlotsSubplots(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + + self.tdf = tm.makeTimeDataFrame() + self.hexbin_df = DataFrame( + { + "A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20), + } + ) + + def _assert_ytickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_yticklabels(), visible=exp) + + def _assert_xtickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_xticklabels(), visible=exp) + + @pytest.mark.slow + def test_subplots(self): + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) + + for kind in ["bar", "barh", "line", "area"]: + axes = df.plot(kind=kind, subplots=True, sharex=True, legend=True) + self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) + assert axes.shape == (3,) + + for ax, column in zip(axes, df.columns): + self._check_legend_labels(ax, labels=[pprint_thing(column)]) + + for ax in axes[:-2]: + self._check_visible(ax.xaxis) # xaxis must be visible for grid + self._check_visible(ax.get_xticklabels(), visible=False) + if not (kind == "bar" and self.mpl_ge_3_1_0): + # change https://github.com/pandas-dev/pandas/issues/26714 + self._check_visible(ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.xaxis.get_label(), visible=False) + self._check_visible(ax.get_yticklabels()) + + self._check_visible(axes[-1].xaxis) + self._check_visible(axes[-1].get_xticklabels()) + self._check_visible(axes[-1].get_xticklabels(minor=True)) + self._check_visible(axes[-1].xaxis.get_label()) + self._check_visible(axes[-1].get_yticklabels()) + + axes = df.plot(kind=kind, subplots=True, sharex=False) + for ax in axes: + self._check_visible(ax.xaxis) + self._check_visible(ax.get_xticklabels()) + self._check_visible(ax.get_xticklabels(minor=True)) + self._check_visible(ax.xaxis.get_label()) + self._check_visible(ax.get_yticklabels()) + + axes = df.plot(kind=kind, subplots=True, legend=False) + for ax in axes: + assert ax.get_legend() is None + + @pytest.mark.slow + def test_subplots_timeseries(self): + idx = date_range(start="2014-07-01", freq="M", periods=10) + df = DataFrame(np.random.rand(10, 3), index=idx) + + for kind in ["line", "area"]: + axes = df.plot(kind=kind, subplots=True, sharex=True) + self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) + + for ax in axes[:-2]: + # GH 7801 + self._check_visible(ax.xaxis) # xaxis must be visible for grid + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.xaxis.get_label(), visible=False) + self._check_visible(ax.get_yticklabels()) + + self._check_visible(axes[-1].xaxis) + self._check_visible(axes[-1].get_xticklabels()) + self._check_visible(axes[-1].get_xticklabels(minor=True)) + self._check_visible(axes[-1].xaxis.get_label()) + self._check_visible(axes[-1].get_yticklabels()) + self._check_ticks_props(axes, xrot=0) + + axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) + for ax in axes: + self._check_visible(ax.xaxis) + self._check_visible(ax.get_xticklabels()) + self._check_visible(ax.get_xticklabels(minor=True)) + self._check_visible(ax.xaxis.get_label()) + self._check_visible(ax.get_yticklabels()) + self._check_ticks_props(ax, xlabelsize=7, xrot=45, ylabelsize=7) + + def test_subplots_timeseries_y_axis(self): + # GH16953 + data = { + "numeric": np.array([1, 2, 5]), + "timedelta": [ + pd.Timedelta(-10, unit="s"), + pd.Timedelta(10, unit="m"), + pd.Timedelta(10, unit="h"), + ], + "datetime_no_tz": [ + pd.to_datetime("2017-08-01 00:00:00"), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00"), + ], + "datetime_all_tz": [ + pd.to_datetime("2017-08-01 00:00:00", utc=True), + pd.to_datetime("2017-08-01 02:00:00", utc=True), + pd.to_datetime("2017-08-02 00:00:00", utc=True), + ], + "text": ["This", "should", "fail"], + } + testdata = DataFrame(data) + + ax_numeric = testdata.plot(y="numeric") + assert ( + ax_numeric.get_lines()[0].get_data()[1] == testdata["numeric"].values + ).all() + ax_timedelta = testdata.plot(y="timedelta") + assert ( + ax_timedelta.get_lines()[0].get_data()[1] == testdata["timedelta"].values + ).all() + ax_datetime_no_tz = testdata.plot(y="datetime_no_tz") + assert ( + ax_datetime_no_tz.get_lines()[0].get_data()[1] + == testdata["datetime_no_tz"].values + ).all() + ax_datetime_all_tz = testdata.plot(y="datetime_all_tz") + assert ( + ax_datetime_all_tz.get_lines()[0].get_data()[1] + == testdata["datetime_all_tz"].values + ).all() + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + testdata.plot(y="text") + + @pytest.mark.xfail(reason="not support for period, categorical, datetime_mixed_tz") + def test_subplots_timeseries_y_axis_not_supported(self): + """ + This test will fail for: + period: + since period isn't yet implemented in ``select_dtypes`` + and because it will need a custom value converter + + tick formatter (as was done for x-axis plots) + + categorical: + because it will need a custom value converter + + tick formatter (also doesn't work for x-axis, as of now) + + datetime_mixed_tz: + because of the way how pandas handles ``Series`` of + ``datetime`` objects with different timezone, + generally converting ``datetime`` objects in a tz-aware + form could help with this problem + """ + data = { + "numeric": np.array([1, 2, 5]), + "period": [ + pd.Period("2017-08-01 00:00:00", freq="H"), + pd.Period("2017-08-01 02:00", freq="H"), + pd.Period("2017-08-02 00:00:00", freq="H"), + ], + "categorical": pd.Categorical( + ["c", "b", "a"], categories=["a", "b", "c"], ordered=False + ), + "datetime_mixed_tz": [ + pd.to_datetime("2017-08-01 00:00:00", utc=True), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00"), + ], + } + testdata = DataFrame(data) + ax_period = testdata.plot(x="numeric", y="period") + assert ( + ax_period.get_lines()[0].get_data()[1] == testdata["period"].values + ).all() + ax_categorical = testdata.plot(x="numeric", y="categorical") + assert ( + ax_categorical.get_lines()[0].get_data()[1] + == testdata["categorical"].values + ).all() + ax_datetime_mixed_tz = testdata.plot(x="numeric", y="datetime_mixed_tz") + assert ( + ax_datetime_mixed_tz.get_lines()[0].get_data()[1] + == testdata["datetime_mixed_tz"].values + ).all() + + @pytest.mark.slow + def test_subplots_layout(self): + # GH 6667 + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) + + axes = df.plot(subplots=True, layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + assert axes.shape == (2, 2) + + axes = df.plot(subplots=True, layout=(-1, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + assert axes.shape == (2, 2) + + axes = df.plot(subplots=True, layout=(2, -1)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + assert axes.shape == (2, 2) + + axes = df.plot(subplots=True, layout=(1, 4)) + self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) + assert axes.shape == (1, 4) + + axes = df.plot(subplots=True, layout=(-1, 4)) + self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) + assert axes.shape == (1, 4) + + axes = df.plot(subplots=True, layout=(4, -1)) + self._check_axes_shape(axes, axes_num=3, layout=(4, 1)) + assert axes.shape == (4, 1) + + with pytest.raises(ValueError): + df.plot(subplots=True, layout=(1, 1)) + with pytest.raises(ValueError): + df.plot(subplots=True, layout=(-1, -1)) + + # single column + df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) + axes = df.plot(subplots=True) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + assert axes.shape == (1,) + + axes = df.plot(subplots=True, layout=(3, 3)) + self._check_axes_shape(axes, axes_num=1, layout=(3, 3)) + assert axes.shape == (3, 3) + + @pytest.mark.slow + def test_subplots_warnings(self): + # GH 9464 + with tm.assert_produces_warning(None): + df = DataFrame(np.random.randn(100, 4)) + df.plot(subplots=True, layout=(3, 2)) + + df = DataFrame( + np.random.randn(100, 4), index=date_range("1/1/2000", periods=100) + ) + df.plot(subplots=True, layout=(3, 2)) + + @pytest.mark.slow + def test_subplots_multiple_axes(self): + # GH 5353, 6970, GH 7069 + fig, axes = self.plt.subplots(2, 3) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) + + returned = df.plot(subplots=True, ax=axes[0], sharex=False, sharey=False) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + assert returned.shape == (3,) + assert returned[0].figure is fig + # draw on second row + returned = df.plot(subplots=True, ax=axes[1], sharex=False, sharey=False) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + assert returned.shape == (3,) + assert returned[0].figure is fig + self._check_axes_shape(axes, axes_num=6, layout=(2, 3)) + tm.close() + + with pytest.raises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + df.plot(subplots=True, ax=axes) + + # pass 2-dim axes and invalid layout + # invalid lauout should not affect to input and return value + # (show warning is tested in + # TestDataFrameGroupByPlots.test_grouped_box_multiple_axes + fig, axes = self.plt.subplots(2, 2) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + df = DataFrame(np.random.rand(10, 4), index=list(string.ascii_letters[:10])) + + returned = df.plot( + subplots=True, ax=axes, layout=(2, 1), sharex=False, sharey=False + ) + self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) + assert returned.shape == (4,) + + returned = df.plot( + subplots=True, ax=axes, layout=(2, -1), sharex=False, sharey=False + ) + self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) + assert returned.shape == (4,) + + returned = df.plot( + subplots=True, ax=axes, layout=(-1, 2), sharex=False, sharey=False + ) + self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) + assert returned.shape == (4,) + + # single column + fig, axes = self.plt.subplots(1, 1) + df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) + + axes = df.plot(subplots=True, ax=[axes], sharex=False, sharey=False) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + assert axes.shape == (1,) + + def test_subplots_ts_share_axes(self): + # GH 3964 + fig, axes = self.plt.subplots(3, 3, sharex=True, sharey=True) + self.plt.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3) + df = DataFrame( + np.random.randn(10, 9), + index=date_range(start="2014-07-01", freq="M", periods=10), + ) + for i, ax in enumerate(axes.ravel()): + df[i].plot(ax=ax, fontsize=5) + + # Rows other than bottom should not be visible + for ax in axes[0:-1].ravel(): + self._check_visible(ax.get_xticklabels(), visible=False) + + # Bottom row should be visible + for ax in axes[-1].ravel(): + self._check_visible(ax.get_xticklabels(), visible=True) + + # First column should be visible + for ax in axes[[0, 1, 2], [0]].ravel(): + self._check_visible(ax.get_yticklabels(), visible=True) + + # Other columns should not be visible + for ax in axes[[0, 1, 2], [1]].ravel(): + self._check_visible(ax.get_yticklabels(), visible=False) + for ax in axes[[0, 1, 2], [2]].ravel(): + self._check_visible(ax.get_yticklabels(), visible=False) + + def test_subplots_sharex_axes_existing_axes(self): + # GH 9158 + d = {"A": [1.0, 2.0, 3.0, 4.0], "B": [4.0, 3.0, 2.0, 1.0], "C": [5, 1, 3, 4]} + df = DataFrame(d, index=date_range("2014 10 11", "2014 10 14")) + + axes = df[["A", "B"]].plot(subplots=True) + df["C"].plot(ax=axes[0], secondary_y=True) + + self._check_visible(axes[0].get_xticklabels(), visible=False) + self._check_visible(axes[1].get_xticklabels(), visible=True) + for ax in axes.ravel(): + self._check_visible(ax.get_yticklabels(), visible=True) + + @pytest.mark.slow + def test_subplots_dup_columns(self): + # GH 10962 + df = DataFrame(np.random.rand(5, 5), columns=list("aaaaa")) + axes = df.plot(subplots=True) + for ax in axes: + self._check_legend_labels(ax, labels=["a"]) + assert len(ax.lines) == 1 + tm.close() + + axes = df.plot(subplots=True, secondary_y="a") + for ax in axes: + # (right) is only attached when subplots=False + self._check_legend_labels(ax, labels=["a"]) + assert len(ax.lines) == 1 + tm.close() + + ax = df.plot(secondary_y="a") + self._check_legend_labels(ax, labels=["a (right)"] * 5) + assert len(ax.lines) == 0 + assert len(ax.right_ax.lines) == 5 + + @pytest.mark.slow + def test_bar_subplots_center(self): + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", subplots=True) + self._check_bar_alignment(df, kind="bar", subplots=True, width=0.9) + self._check_bar_alignment(df, kind="barh", subplots=True) + self._check_bar_alignment(df, kind="barh", subplots=True, width=0.9) + + @pytest.mark.slow + def test_bar_log_no_subplots(self): + # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 + # regressions in 1.2.1 + expected = np.array([0.1, 1.0, 10.0, 100]) + + # no subplots + df = DataFrame({"A": [3] * 5, "B": list(range(1, 6))}, index=range(5)) + ax = df.plot.bar(grid=True, log=True) + tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) + + @pytest.mark.slow + def test_bar_log_subplots(self): + expected = np.array([0.1, 1.0, 10.0, 100.0, 1000.0, 1e4]) + + ax = DataFrame([Series([200, 300]), Series([300, 500])]).plot.bar( + log=True, subplots=True + ) + + tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected) + tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected) + + @pytest.mark.slow + def test_boxplot_subplots_return_type(self): + df = self.hist_df + + # normal style: return_type=None + result = df.plot.box(subplots=True) + assert isinstance(result, Series) + self._check_box_return_type( + result, None, expected_keys=["height", "weight", "category"] + ) + + for t in ["dict", "axes", "both"]: + returned = df.plot.box(return_type=t, subplots=True) + self._check_box_return_type( + returned, + t, + expected_keys=["height", "weight", "category"], + check_ax_title=False, + ) + + @pytest.mark.slow + def test_df_subplots_patterns_minorticks(self): + # GH 10657 + import matplotlib.pyplot as plt + + df = DataFrame( + np.random.randn(10, 2), + index=date_range("1/1/2000", periods=10), + columns=list("AB"), + ) + + # shared subplots + fig, axes = plt.subplots(2, 1, sharex=True) + axes = df.plot(subplots=True, ax=axes) + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_yticklabels(), visible=True) + # xaxis of 1st ax must be hidden + self._check_visible(axes[0].get_xticklabels(), visible=False) + self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) + self._check_visible(axes[1].get_xticklabels(), visible=True) + self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) + tm.close() + + fig, axes = plt.subplots(2, 1) + with tm.assert_produces_warning(UserWarning): + axes = df.plot(subplots=True, ax=axes, sharex=True) + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_yticklabels(), visible=True) + # xaxis of 1st ax must be hidden + self._check_visible(axes[0].get_xticklabels(), visible=False) + self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) + self._check_visible(axes[1].get_xticklabels(), visible=True) + self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) + tm.close() + + # not shared + fig, axes = plt.subplots(2, 1) + axes = df.plot(subplots=True, ax=axes) + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_yticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + tm.close() + + def test_subplots_sharex_false(self): + # test when sharex is set to False, two plots should have different + # labels, GH 25160 + df = DataFrame(np.random.rand(10, 2)) + df.iloc[5:, 1] = np.nan + df.iloc[:5, 0] = np.nan + + figs, axs = self.plt.subplots(2, 1) + df.plot.line(ax=axs, subplots=True, sharex=False) + + expected_ax1 = np.arange(4.5, 10, 0.5) + expected_ax2 = np.arange(-0.5, 5, 0.5) + + tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1) + tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) + + @pytest.mark.parametrize( + "index_name, old_label, new_label", + [ + (None, "", "new"), + ("old", "old", "new"), + (None, "", ""), + (None, "", 1), + (None, "", [1, 2]), + ], + ) + @pytest.mark.parametrize("kind", ["line", "area", "bar"]) + def test_xlabel_ylabel_dataframe_subplots( + self, kind, index_name, old_label, new_label + ): + # GH 9093 + df = DataFrame([[1, 2], [2, 5]], columns=["Type A", "Type B"]) + df.index.name = index_name + + # default is the ylabel is not shown and xlabel is index name + axes = df.plot(kind=kind, subplots=True) + assert all(ax.get_ylabel() == "" for ax in axes) + assert all(ax.get_xlabel() == old_label for ax in axes) + + # old xlabel will be overriden and assigned ylabel will be used as ylabel + axes = df.plot(kind=kind, ylabel=new_label, xlabel=new_label, subplots=True) + assert all(ax.get_ylabel() == str(new_label) for ax in axes) + assert all(ax.get_xlabel() == str(new_label) for ax in axes) + + @pytest.mark.slow + def test_bar_barwidth_position(self): + df = DataFrame(np.random.randn(5, 5)) + self._check_bar_alignment( + df, kind="bar", stacked=False, width=0.9, position=0.2 + ) + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, position=0.2) + self._check_bar_alignment( + df, kind="barh", stacked=False, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="barh", stacked=True, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="bar", subplots=True, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="barh", subplots=True, width=0.9, position=0.2 + ) + + @pytest.mark.slow + def test_bar_barwidth_position_int(self): + # GH 12979 + df = DataFrame(np.random.randn(5, 5)) + + for w in [1, 1.0]: + ax = df.plot.bar(stacked=True, width=w) + ticks = ax.xaxis.get_ticklocs() + tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4])) + assert ax.get_xlim() == (-0.75, 4.75) + # check left-edge of bars + assert ax.patches[0].get_x() == -0.5 + assert ax.patches[-1].get_x() == 3.5 + + self._check_bar_alignment(df, kind="bar", stacked=True, width=1) + self._check_bar_alignment(df, kind="barh", stacked=False, width=1) + self._check_bar_alignment(df, kind="barh", stacked=True, width=1) + self._check_bar_alignment(df, kind="bar", subplots=True, width=1) + self._check_bar_alignment(df, kind="barh", subplots=True, width=1) + + @pytest.mark.slow + def test_bar_stacked_center(self): + # GH2157 + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", stacked=True) + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9) + self._check_bar_alignment(df, kind="barh", stacked=True) + self._check_bar_alignment(df, kind="barh", stacked=True, width=0.9) + + @pytest.mark.slow + def test_bar_center(self): + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", stacked=False) + self._check_bar_alignment(df, kind="bar", stacked=False, width=0.9) + self._check_bar_alignment(df, kind="barh", stacked=False) + self._check_bar_alignment(df, kind="barh", stacked=False, width=0.9) + + @pytest.mark.slow + def test_bar_align_single_column(self): + df = DataFrame(np.random.randn(5)) + self._check_bar_alignment(df, kind="bar", stacked=False) + self._check_bar_alignment(df, kind="bar", stacked=True) + self._check_bar_alignment(df, kind="barh", stacked=False) + self._check_bar_alignment(df, kind="barh", stacked=True) + self._check_bar_alignment(df, kind="bar", subplots=True) + self._check_bar_alignment(df, kind="barh", subplots=True) + + @pytest.mark.slow + def test_bar_edge(self): + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + + self._check_bar_alignment(df, kind="bar", stacked=True, align="edge") + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, align="edge") + self._check_bar_alignment(df, kind="barh", stacked=True, align="edge") + self._check_bar_alignment( + df, kind="barh", stacked=True, width=0.9, align="edge" + ) + + self._check_bar_alignment(df, kind="bar", stacked=False, align="edge") + self._check_bar_alignment( + df, kind="bar", stacked=False, width=0.9, align="edge" + ) + self._check_bar_alignment(df, kind="barh", stacked=False, align="edge") + self._check_bar_alignment( + df, kind="barh", stacked=False, width=0.9, align="edge" + ) + + self._check_bar_alignment(df, kind="bar", subplots=True, align="edge") + self._check_bar_alignment( + df, kind="bar", subplots=True, width=0.9, align="edge" + ) + self._check_bar_alignment(df, kind="barh", subplots=True, align="edge") + self._check_bar_alignment( + df, kind="barh", subplots=True, width=0.9, align="edge" + ) + + def _check_bar_alignment( + self, + df, + kind="bar", + stacked=False, + subplots=False, + align="center", + width=0.5, + position=0.5, + ): + + axes = df.plot( + kind=kind, + stacked=stacked, + subplots=subplots, + align=align, + width=width, + position=position, + grid=True, + ) + + axes = self._flatten_visible(axes) + + for ax in axes: + if kind == "bar": + axis = ax.xaxis + ax_min, ax_max = ax.get_xlim() + min_edge = min(p.get_x() for p in ax.patches) + max_edge = max(p.get_x() + p.get_width() for p in ax.patches) + elif kind == "barh": + axis = ax.yaxis + ax_min, ax_max = ax.get_ylim() + min_edge = min(p.get_y() for p in ax.patches) + max_edge = max(p.get_y() + p.get_height() for p in ax.patches) + else: + raise ValueError + + # GH 7498 + # compare margins between lim and bar edges + tm.assert_almost_equal(ax_min, min_edge - 0.25) + tm.assert_almost_equal(ax_max, max_edge + 0.25) + + p = ax.patches[0] + if kind == "bar" and (stacked is True or subplots is True): + edge = p.get_x() + center = edge + p.get_width() * position + elif kind == "bar" and stacked is False: + center = p.get_x() + p.get_width() * len(df.columns) * position + edge = p.get_x() + elif kind == "barh" and (stacked is True or subplots is True): + center = p.get_y() + p.get_height() * position + edge = p.get_y() + elif kind == "barh" and stacked is False: + center = p.get_y() + p.get_height() * len(df.columns) * position + edge = p.get_y() + else: + raise ValueError + + # Check the ticks locates on integer + assert (axis.get_ticklocs() == np.arange(len(df))).all() + + if align == "center": + # Check whether the bar locates on center + tm.assert_almost_equal(axis.get_ticklocs()[0], center) + elif align == "edge": + # Check whether the bar's edge starts from the tick + tm.assert_almost_equal(axis.get_ticklocs()[0], edge) + else: + raise ValueError + + return axes diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py new file mode 100644 index 0000000000000..665bda15724fd --- /dev/null +++ b/pandas/tests/plotting/test_style.py @@ -0,0 +1,157 @@ +import pytest + +from pandas import Series + +pytest.importorskip("matplotlib") +from pandas.plotting._matplotlib.style import get_standard_colors + + +class TestGetStandardColors: + @pytest.mark.parametrize( + "num_colors, expected", + [ + (3, ["red", "green", "blue"]), + (5, ["red", "green", "blue", "red", "green"]), + (7, ["red", "green", "blue", "red", "green", "blue", "red"]), + (2, ["red", "green"]), + (1, ["red"]), + ], + ) + def test_default_colors_named_from_prop_cycle(self, num_colors, expected): + import matplotlib as mpl + from matplotlib.pyplot import cycler + + mpl_params = { + "axes.prop_cycle": cycler(color=["red", "green", "blue"]), + } + with mpl.rc_context(rc=mpl_params): + result = get_standard_colors(num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected", + [ + (1, ["b"]), + (3, ["b", "g", "r"]), + (4, ["b", "g", "r", "y"]), + (5, ["b", "g", "r", "y", "b"]), + (7, ["b", "g", "r", "y", "b", "g", "r"]), + ], + ) + def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected): + import matplotlib as mpl + from matplotlib.pyplot import cycler + + mpl_params = { + "axes.prop_cycle": cycler(color="bgry"), + } + with mpl.rc_context(rc=mpl_params): + result = get_standard_colors(num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected_name", + [ + (1, ["C0"]), + (3, ["C0", "C1", "C2"]), + ( + 12, + [ + "C0", + "C1", + "C2", + "C3", + "C4", + "C5", + "C6", + "C7", + "C8", + "C9", + "C0", + "C1", + ], + ), + ], + ) + def test_default_colors_named_undefined_prop_cycle(self, num_colors, expected_name): + import matplotlib as mpl + import matplotlib.colors as mcolors + + with mpl.rc_context(rc={}): + expected = [mcolors.to_hex(x) for x in expected_name] + result = get_standard_colors(num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected", + [ + (1, ["red", "green", (0.1, 0.2, 0.3)]), + (2, ["red", "green", (0.1, 0.2, 0.3)]), + (3, ["red", "green", (0.1, 0.2, 0.3)]), + (4, ["red", "green", (0.1, 0.2, 0.3), "red"]), + ], + ) + def test_user_input_color_sequence(self, num_colors, expected): + color = ["red", "green", (0.1, 0.2, 0.3)] + result = get_standard_colors(color=color, num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected", + [ + (1, ["r", "g", "b", "k"]), + (2, ["r", "g", "b", "k"]), + (3, ["r", "g", "b", "k"]), + (4, ["r", "g", "b", "k"]), + (5, ["r", "g", "b", "k", "r"]), + (6, ["r", "g", "b", "k", "r", "g"]), + ], + ) + def test_user_input_color_string(self, num_colors, expected): + color = "rgbk" + result = get_standard_colors(color=color, num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected", + [ + (1, [(0.1, 0.2, 0.3)]), + (2, [(0.1, 0.2, 0.3), (0.1, 0.2, 0.3)]), + (3, [(0.1, 0.2, 0.3), (0.1, 0.2, 0.3), (0.1, 0.2, 0.3)]), + ], + ) + def test_user_input_color_floats(self, num_colors, expected): + color = (0.1, 0.2, 0.3) + result = get_standard_colors(color=color, num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "color, num_colors, expected", + [ + ("Crimson", 1, ["Crimson"]), + ("DodgerBlue", 2, ["DodgerBlue", "DodgerBlue"]), + ("firebrick", 3, ["firebrick", "firebrick", "firebrick"]), + ], + ) + def test_user_input_named_color_string(self, color, num_colors, expected): + result = get_standard_colors(color=color, num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize("color", ["", [], (), Series([], dtype="object")]) + def test_empty_color_raises(self, color): + with pytest.raises(ValueError, match="Invalid color argument"): + get_standard_colors(color=color, num_colors=1) + + @pytest.mark.parametrize( + "color", + [ + "bad_color", + ("red", "green", "bad_color"), + (0.1,), + (0.1, 0.2), + (0.1, 0.2, 0.3, 0.4, 0.5), # must be either 3 or 4 floats + ], + ) + def test_bad_color_raises(self, color): + with pytest.raises(ValueError, match="Invalid color"): + get_standard_colors(color=color, num_colors=5) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index bb2860b88b288..a58372040c7f3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1707,8 +1707,8 @@ def test_other_columns(self, left, right): tm.assert_series_equal(result, expected) # categories are preserved - assert left.X.values.is_dtype_equal(merged.X.values) - assert right.Z.values.is_dtype_equal(merged.Z.values) + assert left.X.values._categories_match_up_to_permutation(merged.X.values) + assert right.Z.values._categories_match_up_to_permutation(merged.Z.values) @pytest.mark.parametrize( "change", @@ -1725,7 +1725,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): X = change(right.X.astype("object")) right = right.assign(X=X) assert is_categorical_dtype(left.X.values.dtype) - # assert not left.X.values.is_dtype_equal(right.X.values) + # assert not left.X.values._categories_match_up_to_permutation(right.X.values) merged = pd.merge(left, right, on="X", how=join_type) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index b1922241c7843..260a0e9d486b2 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp import pandas._testing as tm from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import merge @@ -481,6 +481,53 @@ def test_merge_datetime_index(self, klass): result = df.merge(df, on=[df.index.year], how="inner") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("merge_type", ["left", "right"]) + def test_merge_datetime_multi_index_empty_df(self, merge_type): + # see gh-36895 + + left = DataFrame( + data={ + "data": [1.5, 1.5], + }, + index=MultiIndex.from_tuples( + [[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]], + names=["date", "panel"], + ), + ) + + right = DataFrame( + index=MultiIndex.from_tuples([], names=["date", "panel"]), columns=["state"] + ) + + expected_index = MultiIndex.from_tuples( + [[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]], + names=["date", "panel"], + ) + + if merge_type == "left": + expected = DataFrame( + data={ + "data": [1.5, 1.5], + "state": [None, None], + }, + index=expected_index, + ) + results_merge = left.merge(right, how="left", on=["date", "panel"]) + results_join = left.join(right, how="left") + else: + expected = DataFrame( + data={ + "state": [None, None], + "data": [1.5, 1.5], + }, + index=expected_index, + ) + results_merge = right.merge(left, how="right", on=["date", "panel"]) + results_join = right.join(left, how="right") + + tm.assert_frame_equal(results_merge, expected) + tm.assert_frame_equal(results_join, expected) + def test_join_multi_levels(self): # GH 3662 diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index f150e5e5b18b2..46bc6421c2070 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1554,3 +1554,9 @@ def test_negone_ordinals(): repr(period) period = Period(ordinal=-1, freq="W") repr(period) + + +def test_invalid_frequency_error_message(): + msg = "Invalid frequency: " + with pytest.raises(ValueError, match=msg): + Period("2012-01-02", freq="WOM-1MON") diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py index 9096d2a1033e5..93431a5c75091 100644 --- a/pandas/tests/series/apply/test_series_apply.py +++ b/pandas/tests/series/apply/test_series_apply.py @@ -5,12 +5,23 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, isna +from pandas import DataFrame, Index, MultiIndex, Series, isna, timedelta_range import pandas._testing as tm from pandas.core.base import SpecificationError class TestSeriesApply: + def test_series_map_box_timedelta(self): + # GH#11349 + ser = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) + + def f(x): + return x.total_seconds() + + ser.map(f) + ser.apply(f) + DataFrame(ser).applymap(f) + def test_apply(self, datetime_series): with np.errstate(all="ignore"): tm.assert_series_equal( diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index f8517c3b91fc1..2933983a5b18b 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -1,7 +1,7 @@ """ Series.__getitem__ test classes are organized by the type of key passed. """ -from datetime import datetime +from datetime import datetime, time import numpy as np import pytest @@ -83,6 +83,16 @@ def test_string_index_alias_tz_aware(self, tz): result = ser["1/3/2000"] tm.assert_almost_equal(result, ser[2]) + def test_getitem_time_object(self): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = Series(np.random.randn(len(rng)), index=rng) + + mask = (rng.hour == 9) & (rng.minute == 30) + result = ts[time(9, 30)] + expected = ts[mask] + result.index = result.index._with_freq(None) + tm.assert_series_equal(result, expected) + class TestSeriesGetitemSlices: def test_getitem_slice_2d(self, datetime_series): @@ -271,6 +281,21 @@ def test_getitem_boolean_different_order(self, string_series): exp = string_series[string_series > 0] tm.assert_series_equal(sel, exp) + def test_getitem_boolean_contiguous_preserve_freq(self): + rng = date_range("1/1/2000", "3/1/2000", freq="B") + + mask = np.zeros(len(rng), dtype=bool) + mask[10:20] = True + + masked = rng[mask] + expected = rng[10:20] + assert expected.freq == rng.freq + tm.assert_index_equal(masked, expected) + + mask[22] = True + masked = rng[mask] + assert masked.freq is None + class TestGetitemCallable: def test_getitem_callable(self): diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index 4b4ef5ea046be..43d40d53dcd21 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -161,9 +161,6 @@ def test_errors(self): def test_all_nans(self): # GH 15713 # series is all nans - result = Series([np.nan]).asof([0]) - expected = Series([np.nan]) - tm.assert_series_equal(result, expected) # testing non-default indexes N = 50 diff --git a/pandas/tests/series/methods/test_at_time.py b/pandas/tests/series/methods/test_at_time.py deleted file mode 100644 index 810e4c1446708..0000000000000 --- a/pandas/tests/series/methods/test_at_time.py +++ /dev/null @@ -1,77 +0,0 @@ -from datetime import time - -import numpy as np -import pytest - -from pandas._libs.tslibs import timezones - -from pandas import DataFrame, Series, date_range -import pandas._testing as tm - - -class TestAtTime: - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_localized_at_time(self, tzstr): - tz = timezones.maybe_get_tz(tzstr) - - rng = date_range("4/16/2012", "5/1/2012", freq="H") - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_local = ts.tz_localize(tzstr) - - result = ts_local.at_time(time(10, 0)) - expected = ts.at_time(time(10, 0)).tz_localize(tzstr) - tm.assert_series_equal(result, expected) - assert timezones.tz_compare(result.index.tz, tz) - - def test_at_time(self): - rng = date_range("1/1/2000", "1/5/2000", freq="5min") - ts = Series(np.random.randn(len(rng)), index=rng) - rs = ts.at_time(rng[1]) - assert (rs.index.hour == rng[1].hour).all() - assert (rs.index.minute == rng[1].minute).all() - assert (rs.index.second == rng[1].second).all() - - result = ts.at_time("9:30") - expected = ts.at_time(time(9, 30)) - tm.assert_series_equal(result, expected) - - df = DataFrame(np.random.randn(len(rng), 3), index=rng) - - result = ts[time(9, 30)] - result_df = df.loc[time(9, 30)] - expected = ts[(rng.hour == 9) & (rng.minute == 30)] - exp_df = df[(rng.hour == 9) & (rng.minute == 30)] - - result.index = result.index._with_freq(None) - tm.assert_series_equal(result, expected) - tm.assert_frame_equal(result_df, exp_df) - - chunk = df.loc["1/4/2000":] - result = chunk.loc[time(9, 30)] - expected = result_df[-1:] - - # Without resetting the freqs, these are 5 min and 1440 min, respectively - result.index = result.index._with_freq(None) - expected.index = expected.index._with_freq(None) - tm.assert_frame_equal(result, expected) - - # midnight, everything - rng = date_range("1/1/2000", "1/31/2000") - ts = Series(np.random.randn(len(rng)), index=rng) - - result = ts.at_time(time(0, 0)) - tm.assert_series_equal(result, ts) - - # time doesn't exist - rng = date_range("1/1/2012", freq="23Min", periods=384) - ts = Series(np.random.randn(len(rng)), rng) - rs = ts.at_time("16:00") - assert len(rs) == 0 - - def test_at_time_raises(self): - # GH20725 - ser = Series("a b c".split()) - msg = "Index must be DatetimeIndex" - with pytest.raises(TypeError, match=msg): - ser.at_time("00:00") diff --git a/pandas/tests/series/methods/test_between_time.py b/pandas/tests/series/methods/test_between_time.py deleted file mode 100644 index e9d2f8e6f1637..0000000000000 --- a/pandas/tests/series/methods/test_between_time.py +++ /dev/null @@ -1,144 +0,0 @@ -from datetime import datetime, time -from itertools import product - -import numpy as np -import pytest - -from pandas._libs.tslibs import timezones -import pandas.util._test_decorators as td - -from pandas import DataFrame, Series, date_range -import pandas._testing as tm - - -class TestBetweenTime: - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_localized_between_time(self, tzstr): - tz = timezones.maybe_get_tz(tzstr) - - rng = date_range("4/16/2012", "5/1/2012", freq="H") - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_local = ts.tz_localize(tzstr) - - t1, t2 = time(10, 0), time(11, 0) - result = ts_local.between_time(t1, t2) - expected = ts.between_time(t1, t2).tz_localize(tzstr) - tm.assert_series_equal(result, expected) - assert timezones.tz_compare(result.index.tz, tz) - - def test_between_time(self): - rng = date_range("1/1/2000", "1/5/2000", freq="5min") - ts = Series(np.random.randn(len(rng)), index=rng) - stime = time(0, 0) - etime = time(1, 0) - - close_open = product([True, False], [True, False]) - for inc_start, inc_end in close_open: - filtered = ts.between_time(stime, etime, inc_start, inc_end) - exp_len = 13 * 4 + 1 - if not inc_start: - exp_len -= 5 - if not inc_end: - exp_len -= 4 - - assert len(filtered) == exp_len - for rs in filtered.index: - t = rs.time() - if inc_start: - assert t >= stime - else: - assert t > stime - - if inc_end: - assert t <= etime - else: - assert t < etime - - result = ts.between_time("00:00", "01:00") - expected = ts.between_time(stime, etime) - tm.assert_series_equal(result, expected) - - # across midnight - rng = date_range("1/1/2000", "1/5/2000", freq="5min") - ts = Series(np.random.randn(len(rng)), index=rng) - stime = time(22, 0) - etime = time(9, 0) - - close_open = product([True, False], [True, False]) - for inc_start, inc_end in close_open: - filtered = ts.between_time(stime, etime, inc_start, inc_end) - exp_len = (12 * 11 + 1) * 4 + 1 - if not inc_start: - exp_len -= 4 - if not inc_end: - exp_len -= 4 - - assert len(filtered) == exp_len - for rs in filtered.index: - t = rs.time() - if inc_start: - assert (t >= stime) or (t <= etime) - else: - assert (t > stime) or (t <= etime) - - if inc_end: - assert (t <= etime) or (t >= stime) - else: - assert (t < etime) or (t >= stime) - - def test_between_time_raises(self): - # GH20725 - ser = Series("a b c".split()) - msg = "Index must be DatetimeIndex" - with pytest.raises(TypeError, match=msg): - ser.between_time(start_time="00:00", end_time="12:00") - - def test_between_time_types(self): - # GH11818 - rng = date_range("1/1/2000", "1/5/2000", freq="5min") - msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time" - with pytest.raises(ValueError, match=msg): - rng.indexer_between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) - - frame = DataFrame({"A": 0}, index=rng) - with pytest.raises(ValueError, match=msg): - frame.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) - - series = Series(0, index=rng) - with pytest.raises(ValueError, match=msg): - series.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) - - @td.skip_if_has_locale - def test_between_time_formats(self): - # GH11818 - rng = date_range("1/1/2000", "1/5/2000", freq="5min") - ts = DataFrame(np.random.randn(len(rng), 2), index=rng) - - strings = [ - ("2:00", "2:30"), - ("0200", "0230"), - ("2:00am", "2:30am"), - ("0200am", "0230am"), - ("2:00:00", "2:30:00"), - ("020000", "023000"), - ("2:00:00am", "2:30:00am"), - ("020000am", "023000am"), - ] - expected_length = 28 - - for time_string in strings: - assert len(ts.between_time(*time_string)) == expected_length - - def test_between_time_axis(self): - # issue 8839 - rng = date_range("1/1/2000", periods=100, freq="10min") - ts = Series(np.random.randn(len(rng)), index=rng) - stime, etime = ("08:00:00", "09:00:00") - expected_length = 7 - - assert len(ts.between_time(stime, etime)) == expected_length - assert len(ts.between_time(stime, etime, axis=0)) == expected_length - msg = "No axis named 1 for object type Series" - with pytest.raises(ValueError, match=msg): - ts.between_time(stime, etime, axis=1) diff --git a/pandas/tests/series/methods/test_droplevel.py b/pandas/tests/series/methods/test_droplevel.py deleted file mode 100644 index 449ddd1cd0e49..0000000000000 --- a/pandas/tests/series/methods/test_droplevel.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest - -from pandas import MultiIndex, Series -import pandas._testing as tm - - -class TestDropLevel: - def test_droplevel(self): - # GH#20342 - ser = Series([1, 2, 3, 4]) - ser.index = MultiIndex.from_arrays( - [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"] - ) - expected = ser.reset_index("b", drop=True) - result = ser.droplevel("b", axis="index") - tm.assert_series_equal(result, expected) - # test that droplevel raises ValueError on axis != 0 - with pytest.raises(ValueError, match="No axis named columns"): - ser.droplevel(1, axis="columns") diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index d45486b9bdb29..aaa58cdb390f7 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -653,14 +653,14 @@ def test_fillna_categorical_raises(self): data = ["a", np.nan, "b", np.nan, np.nan] ser = Series(Categorical(data, categories=["a", "b"])) - msg = "'fill_value=d' is not present in this Categorical's categories" + msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): ser.fillna("d") - with pytest.raises(ValueError, match="fill value must be in categories"): + with pytest.raises(ValueError, match=msg): ser.fillna(Series("d")) - with pytest.raises(ValueError, match="fill value must be in categories"): + with pytest.raises(ValueError, match=msg): ser.fillna({1: "d", 3: "a"}) msg = '"value" parameter must be a scalar or dict, but you passed a "list"' diff --git a/pandas/tests/series/methods/test_first_and_last.py b/pandas/tests/series/methods/test_first_and_last.py deleted file mode 100644 index 7629dc8cda30b..0000000000000 --- a/pandas/tests/series/methods/test_first_and_last.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -Note: includes tests for `last` -""" - -import numpy as np -import pytest - -from pandas import Series, date_range -import pandas._testing as tm - - -class TestFirst: - def test_first_subset(self): - rng = date_range("1/1/2000", "1/1/2010", freq="12h") - ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.first("10d") - assert len(result) == 20 - - rng = date_range("1/1/2000", "1/1/2010", freq="D") - ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.first("10d") - assert len(result) == 10 - - result = ts.first("3M") - expected = ts[:"3/31/2000"] - tm.assert_series_equal(result, expected) - - result = ts.first("21D") - expected = ts[:21] - tm.assert_series_equal(result, expected) - - result = ts[:0].first("3M") - tm.assert_series_equal(result, ts[:0]) - - def test_first_raises(self): - # GH#20725 - ser = Series("a b c".split()) - msg = "'first' only supports a DatetimeIndex index" - with pytest.raises(TypeError, match=msg): - ser.first("1D") - - def test_last_subset(self): - rng = date_range("1/1/2000", "1/1/2010", freq="12h") - ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.last("10d") - assert len(result) == 20 - - rng = date_range("1/1/2000", "1/1/2010", freq="D") - ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.last("10d") - assert len(result) == 10 - - result = ts.last("21D") - expected = ts["12/12/2009":] - tm.assert_series_equal(result, expected) - - result = ts.last("21D") - expected = ts[-21:] - tm.assert_series_equal(result, expected) - - result = ts[:0].last("3M") - tm.assert_series_equal(result, ts[:0]) - - def test_last_raises(self): - # GH#20725 - ser = Series("a b c".split()) - msg = "'last' only supports a DatetimeIndex index" - with pytest.raises(TypeError, match=msg): - ser.last("1D") diff --git a/pandas/tests/series/methods/test_is_monotonic.py b/pandas/tests/series/methods/test_is_monotonic.py new file mode 100644 index 0000000000000..b242b293cb59e --- /dev/null +++ b/pandas/tests/series/methods/test_is_monotonic.py @@ -0,0 +1,25 @@ +import numpy as np + +from pandas import Series, date_range + + +class TestIsMonotonic: + def test_is_monotonic_numeric(self): + + ser = Series(np.random.randint(0, 10, size=1000)) + assert not ser.is_monotonic + ser = Series(np.arange(1000)) + assert ser.is_monotonic is True + assert ser.is_monotonic_increasing is True + ser = Series(np.arange(1000, 0, -1)) + assert ser.is_monotonic_decreasing is True + + def test_is_monotonic_dt64(self): + + ser = Series(date_range("20130101", periods=10)) + assert ser.is_monotonic is True + assert ser.is_monotonic_increasing is True + + ser = Series(list(reversed(ser))) + assert ser.is_monotonic is False + assert ser.is_monotonic_decreasing is True diff --git a/pandas/tests/series/indexing/test_pop.py b/pandas/tests/series/methods/test_pop.py similarity index 100% rename from pandas/tests/series/indexing/test_pop.py rename to pandas/tests/series/methods/test_pop.py diff --git a/pandas/tests/series/methods/test_set_name.py b/pandas/tests/series/methods/test_set_name.py new file mode 100644 index 0000000000000..cbc8ebde7a8ab --- /dev/null +++ b/pandas/tests/series/methods/test_set_name.py @@ -0,0 +1,21 @@ +from datetime import datetime + +from pandas import Series + + +class TestSetName: + def test_set_name(self): + ser = Series([1, 2, 3]) + ser2 = ser._set_name("foo") + assert ser2.name == "foo" + assert ser.name is None + assert ser is not ser2 + + def test_set_name_attribute(self): + ser = Series([1, 2, 3]) + ser2 = Series([1, 2, 3], name="bar") + for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05D0"]: + ser.name = name + assert ser.name == name + ser2.name = name + assert ser2.name == name diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index a72e860340f25..714173158f4d6 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -143,11 +143,11 @@ def test_to_csv_compression(self, s, encoding, compression): tm.assert_series_equal(s, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = get_handle( + handles = get_handle( filename, "w", compression=compression, encoding=encoding ) - with f: - s.to_csv(f, encoding=encoding, header=True) + s.to_csv(handles.handle, encoding=encoding, header=True) + handles.close() result = pd.read_csv( filename, compression=compression, diff --git a/pandas/tests/series/methods/test_to_period.py b/pandas/tests/series/methods/test_to_period.py deleted file mode 100644 index b40fc81931e20..0000000000000 --- a/pandas/tests/series/methods/test_to_period.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np -import pytest - -from pandas import ( - DataFrame, - DatetimeIndex, - PeriodIndex, - Series, - date_range, - period_range, -) -import pandas._testing as tm - - -class TestToPeriod: - def test_to_period(self): - rng = date_range("1/1/2000", "1/1/2001", freq="D") - ts = Series(np.random.randn(len(rng)), index=rng) - - pts = ts.to_period() - exp = ts.copy() - exp.index = period_range("1/1/2000", "1/1/2001") - tm.assert_series_equal(pts, exp) - - pts = ts.to_period("M") - exp.index = exp.index.asfreq("M") - tm.assert_index_equal(pts.index, exp.index.asfreq("M")) - tm.assert_series_equal(pts, exp) - - # GH#7606 without freq - idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"]) - exp_idx = PeriodIndex( - ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D" - ) - - s = Series(np.random.randn(4), index=idx) - expected = s.copy() - expected.index = exp_idx - tm.assert_series_equal(s.to_period(), expected) - - df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx) - expected = df.copy() - expected.index = exp_idx - tm.assert_frame_equal(df.to_period(), expected) - - expected = df.copy() - expected.columns = exp_idx - tm.assert_frame_equal(df.to_period(axis=1), expected) - - def test_to_period_raises(self, index): - # https://github.com/pandas-dev/pandas/issues/33327 - ser = Series(index=index, dtype=object) - if not isinstance(index, DatetimeIndex): - msg = f"unsupported Type {type(index).__name__}" - with pytest.raises(TypeError, match=msg): - ser.to_period() diff --git a/pandas/tests/series/methods/test_to_timestamp.py b/pandas/tests/series/methods/test_to_timestamp.py deleted file mode 100644 index 13a2042a2f639..0000000000000 --- a/pandas/tests/series/methods/test_to_timestamp.py +++ /dev/null @@ -1,64 +0,0 @@ -from datetime import timedelta - -import pytest - -from pandas import PeriodIndex, Series, Timedelta, date_range, period_range, to_datetime -import pandas._testing as tm - - -class TestToTimestamp: - def test_to_timestamp(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") - series = Series(1, index=index, name="foo") - - exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") - result = series.to_timestamp(how="end") - exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") - tm.assert_index_equal(result.index, exp_index) - assert result.name == "foo" - - exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") - result = series.to_timestamp(how="start") - tm.assert_index_equal(result.index, exp_index) - - def _get_with_delta(delta, freq="A-DEC"): - return date_range( - to_datetime("1/1/2001") + delta, - to_datetime("12/31/2009") + delta, - freq=freq, - ) - - delta = timedelta(hours=23) - result = series.to_timestamp("H", "end") - exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") - tm.assert_index_equal(result.index, exp_index) - - delta = timedelta(hours=23, minutes=59) - result = series.to_timestamp("T", "end") - exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") - tm.assert_index_equal(result.index, exp_index) - - result = series.to_timestamp("S", "end") - delta = timedelta(hours=23, minutes=59, seconds=59) - exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") - tm.assert_index_equal(result.index, exp_index) - - index = period_range(freq="H", start="1/1/2001", end="1/2/2001") - series = Series(1, index=index, name="foo") - - exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="H") - result = series.to_timestamp(how="end") - exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") - tm.assert_index_equal(result.index, exp_index) - assert result.name == "foo" - - def test_to_timestamp_raises(self, index): - # https://github.com/pandas-dev/pandas/issues/33327 - ser = Series(index=index, dtype=object) - if not isinstance(index, PeriodIndex): - msg = f"unsupported Type {type(index).__name__}" - with pytest.raises(TypeError, match=msg): - ser.to_timestamp() diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index b03f516eeffc5..21de593c0e2af 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -1,102 +1,11 @@ from datetime import datetime -import numpy as np -import pytest - import pandas as pd from pandas import Series, date_range import pandas._testing as tm -from pandas.tseries.offsets import BDay - class TestTruncate: - def test_truncate(self, datetime_series): - offset = BDay() - - ts = datetime_series[::3] - - start, end = datetime_series.index[3], datetime_series.index[6] - start_missing, end_missing = datetime_series.index[2], datetime_series.index[7] - - # neither specified - truncated = ts.truncate() - tm.assert_series_equal(truncated, ts) - - # both specified - expected = ts[1:3] - - truncated = ts.truncate(start, end) - tm.assert_series_equal(truncated, expected) - - truncated = ts.truncate(start_missing, end_missing) - tm.assert_series_equal(truncated, expected) - - # start specified - expected = ts[1:] - - truncated = ts.truncate(before=start) - tm.assert_series_equal(truncated, expected) - - truncated = ts.truncate(before=start_missing) - tm.assert_series_equal(truncated, expected) - - # end specified - expected = ts[:3] - - truncated = ts.truncate(after=end) - tm.assert_series_equal(truncated, expected) - - truncated = ts.truncate(after=end_missing) - tm.assert_series_equal(truncated, expected) - - # corner case, empty series returned - truncated = ts.truncate(after=datetime_series.index[0] - offset) - assert len(truncated) == 0 - - truncated = ts.truncate(before=datetime_series.index[-1] + offset) - assert len(truncated) == 0 - - msg = "Truncate: 1999-12-31 00:00:00 must be after 2000-02-14 00:00:00" - with pytest.raises(ValueError, match=msg): - ts.truncate( - before=datetime_series.index[-1] + offset, - after=datetime_series.index[0] - offset, - ) - - def test_truncate_nonsortedindex(self): - # GH#17935 - - s = Series(["a", "b", "c", "d", "e"], index=[5, 3, 2, 9, 0]) - msg = "truncate requires a sorted index" - - with pytest.raises(ValueError, match=msg): - s.truncate(before=3, after=9) - - rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") - ts = Series(np.random.randn(len(rng)), index=rng) - msg = "truncate requires a sorted index" - - with pytest.raises(ValueError, match=msg): - ts.sort_values(ascending=False).truncate(before="2011-11", after="2011-12") - - @pytest.mark.parametrize( - "before, after, indices", - [(1, 2, [2, 1]), (None, 2, [2, 1, 0]), (1, None, [3, 2, 1])], - ) - @pytest.mark.parametrize("klass", [pd.Int64Index, pd.DatetimeIndex]) - def test_truncate_decreasing_index(self, before, after, indices, klass): - # https://github.com/pandas-dev/pandas/issues/33756 - idx = klass([3, 2, 1, 0]) - if klass is pd.DatetimeIndex: - before = pd.Timestamp(before) if before is not None else None - after = pd.Timestamp(after) if after is not None else None - indices = [pd.Timestamp(i) for i in indices] - values = Series(range(len(idx)), index=idx) - result = values.truncate(before=before, after=after) - expected = values.loc[indices] - tm.assert_series_equal(result, expected) - def test_truncate_datetimeindex_tz(self): # GH 9243 idx = date_range("4/1/2005", "4/30/2005", freq="D", tz="US/Pacific") @@ -133,21 +42,6 @@ def test_truncate_periodindex(self): expected_idx2 = pd.PeriodIndex([pd.Period("2017-09-02")]) tm.assert_series_equal(result2, Series([2], index=expected_idx2)) - def test_truncate_multiindex(self): - # GH 34564 - mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"]) - s1 = Series(range(mi.shape[0]), index=mi, name="col") - result = s1.truncate(before=2, after=3) - - df = pd.DataFrame.from_dict( - {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]} - ) - return_value = df.set_index(["L1", "L2"], inplace=True) - assert return_value is None - expected = df.col - - tm.assert_series_equal(result, expected) - def test_truncate_one_element_series(self): # GH 35544 series = Series([0.1], index=pd.DatetimeIndex(["2020-08-04"])) diff --git a/pandas/tests/series/methods/test_values.py b/pandas/tests/series/methods/test_values.py new file mode 100644 index 0000000000000..e28a714ea656d --- /dev/null +++ b/pandas/tests/series/methods/test_values.py @@ -0,0 +1,20 @@ +import numpy as np +import pytest + +from pandas import IntervalIndex, Series, period_range +import pandas._testing as tm + + +class TestValues: + @pytest.mark.parametrize( + "data", + [ + period_range("2000", periods=4), + IntervalIndex.from_breaks([1, 2, 3, 4]), + ], + ) + def test_values_object_extension_dtypes(self, data): + # https://github.com/pandas-dev/pandas/issues/23995 + result = Series(data).values + expected = np.array(data.astype(object)) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/methods/test_view.py b/pandas/tests/series/methods/test_view.py new file mode 100644 index 0000000000000..ccf3aa0d90e6f --- /dev/null +++ b/pandas/tests/series/methods/test_view.py @@ -0,0 +1,18 @@ +from pandas import Series, date_range +import pandas._testing as tm + + +class TestView: + def test_view_tz(self): + # GH#24024 + ser = Series(date_range("2000", periods=4, tz="US/Central")) + result = ser.view("i8") + expected = Series( + [ + 946706400000000000, + 946792800000000000, + 946879200000000000, + 946965600000000000, + ] + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py deleted file mode 100644 index 181d7de43d945..0000000000000 --- a/pandas/tests/series/test_alter_axes.py +++ /dev/null @@ -1,55 +0,0 @@ -from datetime import datetime - -import numpy as np -import pytest - -from pandas import Index, Series -import pandas._testing as tm - - -class TestSeriesAlterAxes: - def test_setindex(self, string_series): - # wrong type - msg = ( - r"Index\(\.\.\.\) must be called with a collection of some " - r"kind, None was passed" - ) - with pytest.raises(TypeError, match=msg): - string_series.index = None - - # wrong length - msg = ( - "Length mismatch: Expected axis has 30 elements, " - "new values have 29 elements" - ) - with pytest.raises(ValueError, match=msg): - string_series.index = np.arange(len(string_series) - 1) - - # works - string_series.index = np.arange(len(string_series)) - assert isinstance(string_series.index, Index) - - # Renaming - - def test_set_name_attribute(self): - s = Series([1, 2, 3]) - s2 = Series([1, 2, 3], name="bar") - for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05D0"]: - s.name = name - assert s.name == name - s2.name = name - assert s2.name == name - - def test_set_name(self): - s = Series([1, 2, 3]) - s2 = s._set_name("foo") - assert s2.name == "foo" - assert s.name is None - assert s is not s2 - - def test_set_index_makes_timeseries(self): - idx = tm.makeDateIndex(10) - - s = Series(range(10)) - s.index = idx - assert s.index._is_all_dates diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py deleted file mode 100644 index ebb75adde5b13..0000000000000 --- a/pandas/tests/series/test_analytics.py +++ /dev/null @@ -1,23 +0,0 @@ -import numpy as np - -import pandas as pd -from pandas import Series - - -class TestSeriesAnalytics: - def test_is_monotonic(self): - - s = Series(np.random.randint(0, 10, size=1000)) - assert not s.is_monotonic - s = Series(np.arange(1000)) - assert s.is_monotonic is True - assert s.is_monotonic_increasing is True - s = Series(np.arange(1000, 0, -1)) - assert s.is_monotonic_decreasing is True - - s = Series(pd.date_range("20130101", periods=10)) - assert s.is_monotonic is True - assert s.is_monotonic_increasing is True - s = Series(list(reversed(s.tolist()))) - assert s.is_monotonic is False - assert s.is_monotonic_decreasing is True diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 4920796f661fb..fa8f85178ba9f 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -15,6 +15,7 @@ Index, IntervalIndex, Series, + Timedelta, bdate_range, date_range, isna, @@ -277,6 +278,25 @@ def test_alignment_doesnt_change_tz(self): assert ser.index is dti assert ser_utc.index is dti_utc + def test_arithmetic_with_duplicate_index(self): + + # GH#8363 + # integer ops with a non-unique index + index = [2, 2, 3, 3, 4] + ser = Series(np.arange(1, 6, dtype="int64"), index=index) + other = Series(np.arange(5, dtype="int64"), index=index) + result = ser - other + expected = Series(1, index=[2, 2, 3, 3, 4]) + tm.assert_series_equal(result, expected) + + # GH#8363 + # datetime ops with a non-unique index + ser = Series(date_range("20130101 09:00:00", periods=5), index=index) + other = Series(date_range("20130101", periods=5), index=index) + result = ser - other + expected = Series(Timedelta("9 hours"), index=[2, 2, 3, 3, 4]) + tm.assert_series_equal(result, expected) + # ------------------------------------------------------------------ # Comparisons @@ -710,6 +730,21 @@ def test_datetime_understood(self): expected = Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) tm.assert_series_equal(result, expected) + def test_align_date_objects_with_datetimeindex(self): + rng = date_range("1/1/2000", periods=20) + ts = Series(np.random.randn(20), index=rng) + + ts_slice = ts[5:] + ts2 = ts_slice.copy() + ts2.index = [x.date() for x in ts2.index] + + result = ts + ts2 + result2 = ts2 + ts + expected = ts + ts[5:] + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + @pytest.mark.parametrize( "names", @@ -731,17 +766,23 @@ def test_series_ops_name_retention(flex, box, names, all_binary_operators): left = Series(range(10), name=names[0]) right = Series(range(10), name=names[1]) + name = op.__name__.strip("_") + is_logical = name in ["and", "rand", "xor", "rxor", "or", "ror"] + is_rlogical = is_logical and name.startswith("r") + right = box(right) if flex: - name = op.__name__.strip("_") - if name in ["and", "rand", "xor", "rxor", "or", "ror"]: + if is_logical: # Series doesn't have these as flex methods return result = getattr(left, name)(right) else: - result = op(left, right) + # GH#37374 logical ops behaving as set ops deprecated + warn = FutureWarning if is_rlogical and box is Index else None + with tm.assert_produces_warning(warn, check_stacklevel=False): + result = op(left, right) - if box is pd.Index and op.__name__.strip("_") in ["rxor", "ror", "rand"]: + if box is pd.Index and is_rlogical: # Index treats these as set operators, so does not defer assert isinstance(result, pd.Index) return diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 5c4118bc40f4d..c8fbbcf9aed20 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1045,6 +1045,16 @@ def test_constructor_infer_period(self, data_constructor): tm.assert_series_equal(result, expected) assert result.dtype == "Period[D]" + @pytest.mark.xfail(reason="PeriodDtype Series not supported yet") + def test_construct_from_ints_including_iNaT_scalar_period_dtype(self): + series = Series([0, 1000, 2000, pd._libs.iNaT], dtype="period[D]") + + val = series[3] + assert isna(val) + + series[2] = val + assert isna(series[2]) + def test_constructor_period_incompatible_frequency(self): data = [pd.Period("2000", "D"), pd.Period("2001", "A")] result = Series(data) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index b85a53960b0f6..2fbed92567f71 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd -from pandas import Categorical, DataFrame, Series, date_range +from pandas import Categorical, DataFrame, Series import pandas._testing as tm @@ -120,18 +120,20 @@ def cmp(a, b): s.astype("object").astype(CategoricalDtype()), roundtrip_expected ) + def test_invalid_conversions(self): # invalid conversion (these are NOT a dtype) + cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)]) + ser = Series(np.random.randint(0, 10000, 100)).sort_values() + ser = pd.cut(ser, range(0, 10500, 500), right=False, labels=cat) + msg = ( "dtype '' " "not understood" ) - - for invalid in [ - lambda x: x.astype(Categorical), - lambda x: x.astype("object").astype(Categorical), - ]: - with pytest.raises(TypeError, match=msg): - invalid(s) + with pytest.raises(TypeError, match=msg): + ser.astype(Categorical) + with pytest.raises(TypeError, match=msg): + ser.astype("object").astype(Categorical) @pytest.mark.parametrize("dtype", np.typecodes["All"]) def test_astype_empty_constructor_equality(self, dtype): @@ -148,27 +150,6 @@ def test_astype_empty_constructor_equality(self, dtype): as_type_empty = Series([]).astype(dtype) tm.assert_series_equal(init_empty, as_type_empty) - def test_intercept_astype_object(self): - series = Series(date_range("1/1/2000", periods=10)) - - # This test no longer makes sense, as - # Series is by default already M8[ns]. - expected = series.astype("object") - - df = DataFrame({"a": series, "b": np.random.randn(len(series))}) - exp_dtypes = Series( - [np.dtype("datetime64[ns]"), np.dtype("float64")], index=["a", "b"] - ) - tm.assert_series_equal(df.dtypes, exp_dtypes) - - result = df.values.squeeze() - assert (result[:, 0] == expected.values).all() - - df = DataFrame({"a": series, "b": ["foo"] * len(series)}) - - result = df.values.squeeze() - assert (result[:, 0] == expected.values).all() - def test_series_to_categorical(self): # see gh-16524: test conversion of Series to Categorical series = Series(["a", "b", "c"]) @@ -178,19 +159,6 @@ def test_series_to_categorical(self): tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "data", - [ - pd.period_range("2000", periods=4), - pd.IntervalIndex.from_breaks([1, 2, 3, 4]), - ], - ) - def test_values_compatibility(self, data): - # https://github.com/pandas-dev/pandas/issues/23995 - result = Series(data).values - expected = np.array(data.astype(object)) - tm.assert_numpy_array_equal(result, expected) - def test_reindex_astype_order_consistency(self): # GH 17444 s = Series([1, 2, 3], index=[2, 0, 1]) diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py deleted file mode 100644 index d079111aa12d6..0000000000000 --- a/pandas/tests/series/test_period.py +++ /dev/null @@ -1,36 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -from pandas import DataFrame, Series, period_range - - -class TestSeriesPeriod: - def setup_method(self, method): - self.series = Series(period_range("2000-01-01", periods=10, freq="D")) - - # --------------------------------------------------------------------- - # NaT support - - @pytest.mark.xfail(reason="PeriodDtype Series not supported yet") - def test_NaT_scalar(self): - series = Series([0, 1000, 2000, pd._libs.iNaT], dtype="period[D]") - - val = series[3] - assert pd.isna(val) - - series[2] = val - assert pd.isna(series[2]) - - def test_intercept_astype_object(self): - expected = self.series.astype("object") - - df = DataFrame({"a": self.series, "b": np.random.randn(len(self.series))}) - - result = df.values.squeeze() - assert (result[:, 0] == expected.values).all() - - df = DataFrame({"a": self.series, "b": ["foo"] * len(self.series)}) - - result = df.values.squeeze() - assert (result[:, 0] == expected.values).all() diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py deleted file mode 100644 index 8b32be45e8d57..0000000000000 --- a/pandas/tests/series/test_timeseries.py +++ /dev/null @@ -1,71 +0,0 @@ -import numpy as np - -import pandas as pd -from pandas import DataFrame, Series, date_range, timedelta_range -import pandas._testing as tm - - -class TestTimeSeries: - def test_contiguous_boolean_preserve_freq(self): - rng = date_range("1/1/2000", "3/1/2000", freq="B") - - mask = np.zeros(len(rng), dtype=bool) - mask[10:20] = True - - masked = rng[mask] - expected = rng[10:20] - assert expected.freq == rng.freq - tm.assert_index_equal(masked, expected) - - mask[22] = True - masked = rng[mask] - assert masked.freq is None - - def test_promote_datetime_date(self): - rng = date_range("1/1/2000", periods=20) - ts = Series(np.random.randn(20), index=rng) - - ts_slice = ts[5:] - ts2 = ts_slice.copy() - ts2.index = [x.date() for x in ts2.index] - - result = ts + ts2 - result2 = ts2 + ts - expected = ts + ts[5:] - expected.index = expected.index._with_freq(None) - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result2, expected) - - # test asfreq - result = ts2.asfreq("4H", method="ffill") - expected = ts[5:].asfreq("4H", method="ffill") - tm.assert_series_equal(result, expected) - - result = rng.get_indexer(ts2.index) - expected = rng.get_indexer(ts_slice.index) - tm.assert_numpy_array_equal(result, expected) - - def test_series_map_box_timedelta(self): - # GH 11349 - s = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) - - def f(x): - return x.total_seconds() - - s.map(f) - s.apply(f) - DataFrame(s).applymap(f) - - def test_view_tz(self): - # GH#24024 - ser = Series(pd.date_range("2000", periods=4, tz="US/Central")) - result = ser.view("i8") - expected = Series( - [ - 946706400000000000, - 946792800000000000, - 946879200000000000, - 946965600000000000, - ] - ) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 366a1970f6f64..81d866ba63bc0 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -9,7 +9,7 @@ from pandas.compat.numpy import np_version_under1p17 import pandas as pd -from pandas import Series, Timestamp +from pandas import Series import pandas._testing as tm from pandas.core import ops import pandas.core.common as com @@ -109,15 +109,6 @@ def test_maybe_match_name(left, right, expected): assert ops.common._maybe_match_name(left, right) == expected -def test_dict_compat(): - data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} - data_unchanged = {1: 2, 3: 4, 5: 6} - expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} - assert com.dict_compat(data_datetime64) == expected - assert com.dict_compat(expected) == expected - assert com.dict_compat(data_unchanged) == data_unchanged - - def test_standardize_mapping(): # No uninitialized defaultdicts msg = r"to_dict\(\) only accepts initialized defaultdicts" diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 1c9fd46ae451f..5f85ae2ec2318 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -453,3 +453,10 @@ def test_extension_array_codes(self, verify, na_sentinel): expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) tm.assert_numpy_array_equal(codes, expected_codes) + + +def test_mixed_str_nan(): + values = np.array(["b", np.nan, "a", "b"], dtype=object) + result = safe_sort(values) + expected = np.array([np.nan, "a", "b", "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ebe118252c8cf..10bda16655586 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -36,6 +36,16 @@ class TestTimeConversionFormats: + @pytest.mark.parametrize("readonly", [True, False]) + def test_to_datetime_readonly(self, readonly): + # GH#34857 + arr = np.array([], dtype=object) + if readonly: + arr.setflags(write=False) + result = to_datetime(arr) + expected = to_datetime([]) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format(self, cache): values = ["1/1/2000", "1/2/2000", "1/3/2000"] diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 8e48295c533cc..5be7e81df53f2 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -9,6 +9,16 @@ class TestTimedeltas: + @pytest.mark.parametrize("readonly", [True, False]) + def test_to_timedelta_readonly(self, readonly): + # GH#34857 + arr = np.array([], dtype=object) + if readonly: + arr.setflags(write=False) + result = to_timedelta(arr) + expected = to_timedelta([]) + tm.assert_index_equal(result, expected) + def test_to_timedelta(self): result = to_timedelta(["", ""]) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2c8439aae75e5..02bcfab8d3388 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1087,3 +1087,18 @@ def test_rolling_corr_timedelta_index(index, window): result = x.rolling(window).corr(y) expected = Series([np.nan, np.nan, 1, 1, 1], index=index) tm.assert_almost_equal(result, expected) + + +def test_groupby_rolling_nan_included(): + # GH 35542 + data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} + df = DataFrame(data) + result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean() + expected = DataFrame( + {"B": [0.0, 2.0, 3.0, 1.0, 4.0]}, + index=pd.MultiIndex.from_tuples( + [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)], + names=["group", None], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 7b648a589bc61..9c58a55cb907e 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -474,7 +474,7 @@ def main( sys.exit( main( - function=globals().get(args.validation_type), # type: ignore + function=globals().get(args.validation_type), source_path=args.paths, output_format=args.format, )