From 2cb234360ca21dac7282d6f435ed3566daddcbf6 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sat, 4 Jan 2020 19:07:15 +0000 Subject: [PATCH 01/10] update docs and error messages for labels --- pandas/core/reshape/tile.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 8cf51ae09fbcb..7c97fa88b4b33 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -15,6 +15,7 @@ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer, + is_list_like, is_scalar, is_timedelta64_dtype, ) @@ -65,11 +66,12 @@ def cut( ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` indicate (1,2], (2,3], (3,4]. This argument is ignored when `bins` is an IntervalIndex. - labels : array or bool, optional + labels : array or False, default None Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the bins. This affects the type of the output container (see below). - This argument is ignored when `bins` is an IntervalIndex. + This argument is ignored when `bins` is an IntervalIndex. If True, + raises an error. retbins : bool, default False Whether to return the bins or not. Useful when bins is provided as a scalar. @@ -286,10 +288,10 @@ def qcut( q : int or list-like of int Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. - labels : array or bool, default None + labels : array or False, default None Used as labels for the resulting bins. Must be of the same length as the resulting bins. If False, return only integer indicators of the - bins. + bins. If True, raises an error. retbins : bool, optional Whether to return the (bins, labels) or not. Can be useful if bins is given as a scalar. @@ -395,11 +397,18 @@ def _bins_to_cuts( labels = _format_labels( bins, precision, right=right, include_lowest=include_lowest, dtype=dtype ) - else: + elif labels: + raise ValueError( + "User desired bin labels must be passed in as an argument, " + "not just `True`" + ) + elif is_list_like(labels): if len(labels) != len(bins) - 1: raise ValueError( "Bin labels must be one fewer than the number of bin edges" ) + else: + labels = Categorical(labels, categories=labels, ordered=True) if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) From 5d8c8d9ab5b926f3d9b6d97ee240ffbed1cc7c7d Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sat, 4 Jan 2020 19:07:45 +0000 Subject: [PATCH 02/10] add test for label is True --- pandas/tests/reshape/test_qcut.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index c5ca05056a306..aef94fb6cb6a9 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -130,6 +130,14 @@ def test_qcut_return_intervals(): tm.assert_series_equal(res, exp) +def test_qcut_labels_true(): + # issue 13318 + values = range(5) + msg = "User desired bin labels must be passed in as an argument, not just `True`" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=True) + + @pytest.mark.parametrize( "kwargs,msg", [ From b9928e71480ea066521973430cb98edd38a0dd99 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sat, 4 Jan 2020 23:47:26 +0000 Subject: [PATCH 03/10] switch to labels is True and add more tests --- pandas/core/reshape/tile.py | 4 +- pandas/tests/reshape/test_qcut 2.py | 256 ++++++++++++++++++++++++++++ pandas/tests/reshape/test_qcut.py | 24 ++- 3 files changed, 280 insertions(+), 4 deletions(-) create mode 100644 pandas/tests/reshape/test_qcut 2.py diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 7c97fa88b4b33..c17be67d3f7f6 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -397,7 +397,7 @@ def _bins_to_cuts( labels = _format_labels( bins, precision, right=right, include_lowest=include_lowest, dtype=dtype ) - elif labels: + elif labels is True: raise ValueError( "User desired bin labels must be passed in as an argument, " "not just `True`" @@ -407,8 +407,6 @@ def _bins_to_cuts( raise ValueError( "Bin labels must be one fewer than the number of bin edges" ) - else: - labels = Categorical(labels, categories=labels, ordered=True) if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) diff --git a/pandas/tests/reshape/test_qcut 2.py b/pandas/tests/reshape/test_qcut 2.py new file mode 100644 index 0000000000000..eca9b11bd4364 --- /dev/null +++ b/pandas/tests/reshape/test_qcut 2.py @@ -0,0 +1,256 @@ +import os + +import numpy as np +import pytest + +from pandas import ( + Categorical, + DatetimeIndex, + Interval, + IntervalIndex, + NaT, + Series, + TimedeltaIndex, + Timestamp, + cut, + date_range, + isna, + qcut, + timedelta_range, +) +from pandas.api.types import CategoricalDtype as CDT +from pandas.core.algorithms import quantile +import pandas.util.testing as tm + +from pandas.tseries.offsets import Day, Nano + + +def test_qcut(): + arr = np.random.randn(1000) + + # We store the bins as Index that have been + # rounded to comparisons are a bit tricky. + labels, bins = qcut(arr, 4, retbins=True) + ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0]) + + result = labels.categories.left.values + assert np.allclose(result, ex_bins[:-1], atol=1e-2) + + result = labels.categories.right.values + assert np.allclose(result, ex_bins[1:], atol=1e-2) + + ex_levels = cut(arr, ex_bins, include_lowest=True) + tm.assert_categorical_equal(labels, ex_levels) + + +def test_qcut_bounds(): + arr = np.random.randn(1000) + + factor = qcut(arr, 10, labels=False) + assert len(np.unique(factor)) == 10 + + +def test_qcut_specify_quantiles(): + arr = np.random.randn(100) + factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0]) + + expected = qcut(arr, 4) + tm.assert_categorical_equal(factor, expected) + + +def test_qcut_all_bins_same(): + with pytest.raises(ValueError, match="edges.*unique"): + qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) + + +def test_qcut_include_lowest(): + values = np.arange(10) + ii = qcut(values, 4) + + ex_levels = IntervalIndex( + [ + Interval(-0.001, 2.25), + Interval(2.25, 4.5), + Interval(4.5, 6.75), + Interval(6.75, 9), + ] + ) + tm.assert_index_equal(ii.categories, ex_levels) + + +def test_qcut_nas(): + arr = np.random.randn(100) + arr[:20] = np.nan + + result = qcut(arr, 4) + assert isna(result[:20]).all() + + +def test_qcut_index(): + result = qcut([0, 2], 2) + intervals = [Interval(-0.001, 1), Interval(1, 2)] + + expected = Categorical(intervals, ordered=True) + tm.assert_categorical_equal(result, expected) + + +def test_qcut_binning_issues(datapath): + # see gh-1978, gh-1979 + cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv")) + arr = np.loadtxt(cut_file) + result = qcut(arr, 20) + + starts = [] + ends = [] + + for lev in np.unique(result): + s = lev.left + e = lev.right + assert s != e + + starts.append(float(s)) + ends.append(float(e)) + + for (sp, sn), (ep, en) in zip( + zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:]) + ): + assert sp < sn + assert ep < en + assert ep <= sn + + +def test_qcut_return_intervals(): + ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = qcut(ser, [0, 0.333, 0.666, 1]) + + exp_levels = np.array( + [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)] + ) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) + tm.assert_series_equal(res, exp) + + +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(duplicates="drop"), None), + (dict(), "Bin edges must be unique"), + (dict(duplicates="raise"), "Bin edges must be unique"), + (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), + ], +) +def test_qcut_duplicates_bin(kwargs, msg): + # see gh-7751 + values = [0, 0, 0, 0, 1, 2, 3] + + if msg is not None: + with pytest.raises(ValueError, match=msg): + qcut(values, 3, **kwargs) + else: + result = qcut(values, 3, **kwargs) + expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)]) + tm.assert_index_equal(result.categories, expected) + + +@pytest.mark.parametrize( + "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)] +) +@pytest.mark.parametrize("length", [1, 2]) +@pytest.mark.parametrize("labels", [None, False]) +def test_single_quantile(data, start, end, length, labels): + # see gh-15431 + ser = Series([data] * length) + result = qcut(ser, 1, labels=labels) + + if labels is None: + intervals = IntervalIndex([Interval(start, end)] * length, closed="right") + expected = Series(intervals).astype(CDT(ordered=True)) + else: + expected = Series([0] * length) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ser", + [ + Series(DatetimeIndex(["20180101", NaT, "20180103"])), + Series(TimedeltaIndex(["0 days", NaT, "2 days"])), + ], + ids=lambda x: str(x.dtype), +) +def test_qcut_nat(ser): + # see gh-19768 + intervals = IntervalIndex.from_tuples( + [(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])] + ) + expected = Series(Categorical(intervals, ordered=True)) + + result = qcut(ser, 2) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)]) +def test_datetime_tz_qcut(bins): + # see gh-19872 + tz = "US/Eastern" + ser = Series(date_range("20130101", periods=3, tz=tz)) + + result = qcut(ser, bins) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:59:59.999999999", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + ).astype(CDT(ordered=True)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "arg,expected_bins", + [ + [ + timedelta_range("1day", periods=3), + TimedeltaIndex(["1 days", "2 days", "3 days"]), + ], + [ + date_range("20180101", periods=3), + DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]), + ], + ], +) +def test_date_like_qcut_bins(arg, expected_bins): + # see gh-19891 + ser = Series(arg) + result, result_bins = qcut(ser, 2, retbins=True) + tm.assert_index_equal(result_bins, expected_bins) + + +@pytest.mark.parametrize("bins", [6, 7]) +@pytest.mark.parametrize( + "box, compare", + [ + (Series, tm.assert_series_equal), + (np.array, tm.assert_categorical_equal), + (list, tm.assert_equal), + ], +) +def test_qcut_bool_coercion_to_int(bins, box, compare): + # issue 20303 + data_expected = box([0, 1, 1, 0, 1] * 10) + data_result = box([False, True, True, False, True] * 10) + expected = qcut(data_expected, bins, duplicates="drop") + result = qcut(data_result, bins, duplicates="drop") + compare(result, expected) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index aef94fb6cb6a9..8aa0c644b0d3e 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -131,13 +131,35 @@ def test_qcut_return_intervals(): def test_qcut_labels_true(): - # issue 13318 + # GH 13318 values = range(5) msg = "User desired bin labels must be passed in as an argument, not just `True`" with pytest.raises(ValueError, match=msg): qcut(values, 4, labels=True) +@pytest.mark.parametrize("kwargs", [["a", "b", "c"], list(range(3))]) +def test_qcut_wrong_length_labels(kwargs): + # GH 13318 + values = range(10) + msg = "Bin labels must be one fewer than the number of bin edges" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=kwargs) + + +@pytest.mark.parametrize( + "kwargs, expected", + [ + (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)), + (list(range(3)), Categorical([0, 1, 2], ordered=True)), + ], +) +def test_qcut_list_like_labels(kwargs, expected): + # GH 13318 + values = range(10) + qcut(values, 3, labels=kwargs) + + @pytest.mark.parametrize( "kwargs,msg", [ From 234e645dbf1961ede6e2c6874486970742048be0 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sun, 5 Jan 2020 00:03:04 +0000 Subject: [PATCH 04/10] Remove module added by mistake --- pandas/tests/reshape/test_qcut 2.py | 256 ---------------------------- 1 file changed, 256 deletions(-) delete mode 100644 pandas/tests/reshape/test_qcut 2.py diff --git a/pandas/tests/reshape/test_qcut 2.py b/pandas/tests/reshape/test_qcut 2.py deleted file mode 100644 index eca9b11bd4364..0000000000000 --- a/pandas/tests/reshape/test_qcut 2.py +++ /dev/null @@ -1,256 +0,0 @@ -import os - -import numpy as np -import pytest - -from pandas import ( - Categorical, - DatetimeIndex, - Interval, - IntervalIndex, - NaT, - Series, - TimedeltaIndex, - Timestamp, - cut, - date_range, - isna, - qcut, - timedelta_range, -) -from pandas.api.types import CategoricalDtype as CDT -from pandas.core.algorithms import quantile -import pandas.util.testing as tm - -from pandas.tseries.offsets import Day, Nano - - -def test_qcut(): - arr = np.random.randn(1000) - - # We store the bins as Index that have been - # rounded to comparisons are a bit tricky. - labels, bins = qcut(arr, 4, retbins=True) - ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0]) - - result = labels.categories.left.values - assert np.allclose(result, ex_bins[:-1], atol=1e-2) - - result = labels.categories.right.values - assert np.allclose(result, ex_bins[1:], atol=1e-2) - - ex_levels = cut(arr, ex_bins, include_lowest=True) - tm.assert_categorical_equal(labels, ex_levels) - - -def test_qcut_bounds(): - arr = np.random.randn(1000) - - factor = qcut(arr, 10, labels=False) - assert len(np.unique(factor)) == 10 - - -def test_qcut_specify_quantiles(): - arr = np.random.randn(100) - factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0]) - - expected = qcut(arr, 4) - tm.assert_categorical_equal(factor, expected) - - -def test_qcut_all_bins_same(): - with pytest.raises(ValueError, match="edges.*unique"): - qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) - - -def test_qcut_include_lowest(): - values = np.arange(10) - ii = qcut(values, 4) - - ex_levels = IntervalIndex( - [ - Interval(-0.001, 2.25), - Interval(2.25, 4.5), - Interval(4.5, 6.75), - Interval(6.75, 9), - ] - ) - tm.assert_index_equal(ii.categories, ex_levels) - - -def test_qcut_nas(): - arr = np.random.randn(100) - arr[:20] = np.nan - - result = qcut(arr, 4) - assert isna(result[:20]).all() - - -def test_qcut_index(): - result = qcut([0, 2], 2) - intervals = [Interval(-0.001, 1), Interval(1, 2)] - - expected = Categorical(intervals, ordered=True) - tm.assert_categorical_equal(result, expected) - - -def test_qcut_binning_issues(datapath): - # see gh-1978, gh-1979 - cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv")) - arr = np.loadtxt(cut_file) - result = qcut(arr, 20) - - starts = [] - ends = [] - - for lev in np.unique(result): - s = lev.left - e = lev.right - assert s != e - - starts.append(float(s)) - ends.append(float(e)) - - for (sp, sn), (ep, en) in zip( - zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:]) - ): - assert sp < sn - assert ep < en - assert ep <= sn - - -def test_qcut_return_intervals(): - ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) - res = qcut(ser, [0, 0.333, 0.666, 1]) - - exp_levels = np.array( - [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)] - ) - exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) - tm.assert_series_equal(res, exp) - - -@pytest.mark.parametrize( - "kwargs,msg", - [ - (dict(duplicates="drop"), None), - (dict(), "Bin edges must be unique"), - (dict(duplicates="raise"), "Bin edges must be unique"), - (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), - ], -) -def test_qcut_duplicates_bin(kwargs, msg): - # see gh-7751 - values = [0, 0, 0, 0, 1, 2, 3] - - if msg is not None: - with pytest.raises(ValueError, match=msg): - qcut(values, 3, **kwargs) - else: - result = qcut(values, 3, **kwargs) - expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)]) - tm.assert_index_equal(result.categories, expected) - - -@pytest.mark.parametrize( - "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)] -) -@pytest.mark.parametrize("length", [1, 2]) -@pytest.mark.parametrize("labels", [None, False]) -def test_single_quantile(data, start, end, length, labels): - # see gh-15431 - ser = Series([data] * length) - result = qcut(ser, 1, labels=labels) - - if labels is None: - intervals = IntervalIndex([Interval(start, end)] * length, closed="right") - expected = Series(intervals).astype(CDT(ordered=True)) - else: - expected = Series([0] * length) - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "ser", - [ - Series(DatetimeIndex(["20180101", NaT, "20180103"])), - Series(TimedeltaIndex(["0 days", NaT, "2 days"])), - ], - ids=lambda x: str(x.dtype), -) -def test_qcut_nat(ser): - # see gh-19768 - intervals = IntervalIndex.from_tuples( - [(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])] - ) - expected = Series(Categorical(intervals, ordered=True)) - - result = qcut(ser, 2) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)]) -def test_datetime_tz_qcut(bins): - # see gh-19872 - tz = "US/Eastern" - ser = Series(date_range("20130101", periods=3, tz=tz)) - - result = qcut(ser, bins) - expected = Series( - IntervalIndex( - [ - Interval( - Timestamp("2012-12-31 23:59:59.999999999", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz), - ), - ] - ) - ).astype(CDT(ordered=True)) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "arg,expected_bins", - [ - [ - timedelta_range("1day", periods=3), - TimedeltaIndex(["1 days", "2 days", "3 days"]), - ], - [ - date_range("20180101", periods=3), - DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]), - ], - ], -) -def test_date_like_qcut_bins(arg, expected_bins): - # see gh-19891 - ser = Series(arg) - result, result_bins = qcut(ser, 2, retbins=True) - tm.assert_index_equal(result_bins, expected_bins) - - -@pytest.mark.parametrize("bins", [6, 7]) -@pytest.mark.parametrize( - "box, compare", - [ - (Series, tm.assert_series_equal), - (np.array, tm.assert_categorical_equal), - (list, tm.assert_equal), - ], -) -def test_qcut_bool_coercion_to_int(bins, box, compare): - # issue 20303 - data_expected = box([0, 1, 1, 0, 1] * 10) - data_result = box([False, True, True, False, True] * 10) - expected = qcut(data_expected, bins, duplicates="drop") - result = qcut(data_result, bins, duplicates="drop") - compare(result, expected) From 915bc506ba8eb0f03a7d5060c7e7fda377f944fd Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sun, 5 Jan 2020 00:45:06 +0000 Subject: [PATCH 05/10] add assertion in test --- pandas/tests/reshape/test_qcut.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 8aa0c644b0d3e..89946f6763fb4 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -148,16 +148,17 @@ def test_qcut_wrong_length_labels(kwargs): @pytest.mark.parametrize( - "kwargs, expected", + "labels, expected", [ (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)), (list(range(3)), Categorical([0, 1, 2], ordered=True)), ], ) -def test_qcut_list_like_labels(kwargs, expected): +def test_qcut_list_like_labels(labels, expected): # GH 13318 - values = range(10) - qcut(values, 3, labels=kwargs) + values = range(3) + result = qcut(values, 3, labels=labels) + tm.assert_categorical_equal(result, expected) @pytest.mark.parametrize( From 33ea2c60bc8fc2e30dfb3e0faaca89271511223d Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sun, 5 Jan 2020 00:51:20 +0000 Subject: [PATCH 06/10] change param naming --- pandas/tests/reshape/test_qcut.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 89946f6763fb4..da6d6823d0491 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -138,13 +138,13 @@ def test_qcut_labels_true(): qcut(values, 4, labels=True) -@pytest.mark.parametrize("kwargs", [["a", "b", "c"], list(range(3))]) -def test_qcut_wrong_length_labels(kwargs): +@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))]) +def test_qcut_wrong_length_labels(labels): # GH 13318 values = range(10) msg = "Bin labels must be one fewer than the number of bin edges" with pytest.raises(ValueError, match=msg): - qcut(values, 4, labels=kwargs) + qcut(values, 4, labels=labels) @pytest.mark.parametrize( From a1fb3a10317a6fd4173299976876ee2d6c111022 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sun, 5 Jan 2020 03:59:03 +0000 Subject: [PATCH 07/10] Add whats new note --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5b4761c3bc6c5..917ffba3ec0a7 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -999,7 +999,7 @@ Reshaping - Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) - Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`) - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`) -- +- Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`) Sparse ^^^^^^ From 174dc3d5abe3b4d4b9087f6a5066511f12cdca05 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Tue, 7 Jan 2020 17:33:19 +0000 Subject: [PATCH 08/10] Add case for non list, simplify condition block and extra test for cut --- pandas/core/reshape/tile.py | 20 +++++++++++--------- pandas/tests/reshape/test_cut.py | 9 +++++++++ pandas/tests/reshape/test_qcut.py | 7 ++++--- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index c17be67d3f7f6..1b9458feec9f5 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -393,20 +393,22 @@ def _bins_to_cuts( has_nas = na_mask.any() if labels is not False: - if labels is None: + if not (labels is None or is_list_like(labels)): + raise ValueError( + "Bin labels must either be False, None or passed in as a " + "list-like argument" + ) + + elif labels is None: labels = _format_labels( bins, precision, right=right, include_lowest=include_lowest, dtype=dtype ) - elif labels is True: + + elif len(labels) != len(bins) - 1: raise ValueError( - "User desired bin labels must be passed in as an argument, " - "not just `True`" + "Bin labels must be one fewer than the number of bin edges" ) - elif is_list_like(labels): - if len(labels) != len(bins) - 1: - raise ValueError( - "Bin labels must be one fewer than the number of bin edges" - ) + if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index e52636d54ebe8..13b6f05ed304a 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -603,3 +603,12 @@ def test_cut_bool_coercion_to_int(bins, box, compare): expected = cut(data_expected, bins, duplicates="drop") result = cut(data_result, bins, duplicates="drop") compare(result, expected) + + +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_cut_incorrect_labels(labels): + # GH 13318 + values = range(5) + msg = "Bin labels must either be False, None or passed in as a list-like argument" + with pytest.raises(ValueError, match=msg): + cut(values, 4, labels=labels) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index da6d6823d0491..95406a5ebf4f7 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -130,12 +130,13 @@ def test_qcut_return_intervals(): tm.assert_series_equal(res, exp) -def test_qcut_labels_true(): +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_qcut_incorrect_labels(labels): # GH 13318 values = range(5) - msg = "User desired bin labels must be passed in as an argument, not just `True`" + msg = "Bin labels must either be False, None or passed in as a list-like argument" with pytest.raises(ValueError, match=msg): - qcut(values, 4, labels=True) + qcut(values, 4, labels=labels) @pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))]) From 49ab9af0c7a93a8d86d4bb25d789d9e314d62f07 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Tue, 7 Jan 2020 17:44:09 +0000 Subject: [PATCH 09/10] linting --- pandas/core/reshape/tile.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 1b9458feec9f5..ab573807f5ff3 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -404,10 +404,11 @@ def _bins_to_cuts( bins, precision, right=right, include_lowest=include_lowest, dtype=dtype ) - elif len(labels) != len(bins) - 1: - raise ValueError( - "Bin labels must be one fewer than the number of bin edges" - ) + else: + if len(labels) != len(bins) - 1: + raise ValueError( + "Bin labels must be one fewer than the number of " "bin edges" + ) if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) From a3566e6d802f29dfb65686bef512c1d412194139 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Tue, 7 Jan 2020 17:48:08 +0000 Subject: [PATCH 10/10] more linting --- pandas/core/reshape/tile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index ab573807f5ff3..2e3eb9170b15c 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -407,7 +407,7 @@ def _bins_to_cuts( else: if len(labels) != len(bins) - 1: raise ValueError( - "Bin labels must be one fewer than the number of " "bin edges" + "Bin labels must be one fewer than the number of bin edges" ) if not is_categorical_dtype(labels):