From f8e9c113e7d4bed1418ea5cc99b46df1315ef828 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 14 Jan 2021 14:35:09 +0700 Subject: [PATCH 1/2] REF: extract function _refine_percentiles --- pandas/core/describe.py | 53 +++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 4a67725449ca8..7eb35fe75620f 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -62,26 +62,7 @@ def describe_ndframe( if obj.ndim == 2 and obj.columns.size == 0: raise ValueError("Cannot describe a DataFrame without columns") - if percentiles is not None: - # explicit conversion of `percentiles` to list - percentiles = list(percentiles) - - # get them all to be in [0, 1] - validate_percentile(percentiles) - - # median should always be included - if 0.5 not in percentiles: - percentiles.append(0.5) - percentiles = np.asarray(percentiles) - else: - percentiles = np.array([0.25, 0.5, 0.75]) - - # sort and check for duplicates - unique_pcts = np.unique(percentiles) - assert percentiles is not None - if len(unique_pcts) < len(percentiles): - raise ValueError("percentiles cannot contain duplicates") - percentiles = unique_pcts + percentiles = _refine_percentiles(percentiles) if obj.ndim == 1: # Incompatible return value type @@ -263,3 +244,35 @@ def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series return describe_numeric_1d(data, percentiles) else: return describe_categorical_1d(data, is_series) + + +def _refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]: + """Ensure that percentiles are unique and sorted. + + Parameters + ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. + """ + if percentiles is None: + return np.array([0.25, 0.5, 0.75]) + + # explicit conversion of `percentiles` to list + percentiles = list(percentiles) + + # get them all to be in [0, 1] + validate_percentile(percentiles) + + # median should always be included + if 0.5 not in percentiles: + percentiles.append(0.5) + + percentiles = np.asarray(percentiles) + + # sort and check for duplicates + unique_pcts = np.unique(percentiles) + assert percentiles is not None + if len(unique_pcts) < len(percentiles): + raise ValueError("percentiles cannot contain duplicates") + + return unique_pcts From 838d03d02a899967c1e040eec855f24a5a04751f Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 14 Jan 2021 14:35:34 +0700 Subject: [PATCH 2/2] TYP: type remaining arguments --- pandas/core/describe.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 7eb35fe75620f..4c178de2a182e 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -4,7 +4,7 @@ Method NDFrame.describe() delegates actual execution to function describe_ndframe(). """ -from typing import TYPE_CHECKING, List, Optional, Sequence, Union +from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast import warnings import numpy as np @@ -65,10 +65,11 @@ def describe_ndframe( percentiles = _refine_percentiles(percentiles) if obj.ndim == 1: + series = cast("Series", obj) # Incompatible return value type # (got "Series", expected "FrameOrSeries") [return-value] return describe_1d( - obj, + series, percentiles, datetime_is_numeric, is_series=True, @@ -106,14 +107,14 @@ def describe_ndframe( return d -def describe_numeric_1d(series, percentiles) -> "Series": +def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> "Series": """Describe series containing numerical data. Parameters ---------- series : Series Series to be described. - percentiles : list-like of numbers, optional + percentiles : list-like of numbers The percentiles to include in the output. """ from pandas import Series @@ -129,7 +130,7 @@ def describe_numeric_1d(series, percentiles) -> "Series": return Series(d, index=stat_index, name=series.name) -def describe_categorical_1d(data, is_series) -> "Series": +def describe_categorical_1d(data: "Series", is_series: bool) -> "Series": """Describe series containing categorical data. Parameters @@ -191,14 +192,14 @@ def describe_categorical_1d(data, is_series) -> "Series": return Series(result, index=names, name=data.name, dtype=dtype) -def describe_timestamp_1d(data, percentiles) -> "Series": +def describe_timestamp_1d(data: "Series", percentiles: Sequence[float]) -> "Series": """Describe series containing datetime64 dtype. Parameters ---------- data : Series Series to be described. - percentiles : list-like of numbers, optional + percentiles : list-like of numbers The percentiles to include in the output. """ # GH-30164 @@ -215,14 +216,20 @@ def describe_timestamp_1d(data, percentiles) -> "Series": return Series(d, index=stat_index, name=data.name) -def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series": +def describe_1d( + data: "Series", + percentiles: Sequence[float], + datetime_is_numeric: bool, + *, + is_series: bool, +) -> "Series": """Describe series. Parameters ---------- data : Series Series to be described. - percentiles : list-like of numbers, optional + percentiles : list-like of numbers The percentiles to include in the output. datetime_is_numeric : bool, default False Whether to treat datetime dtypes as numeric.