Skip to content

ERR: Improve error message and doc for invalid labels in cut/qcut #30691

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -999,7 +999,7 @@ Reshaping
- Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`)
- Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`)
- Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`)
-
- Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`)

Sparse
^^^^^^
Expand Down
20 changes: 15 additions & 5 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
is_datetime64tz_dtype,
is_datetime_or_timedelta_dtype,
is_integer,
is_list_like,
is_scalar,
is_timedelta64_dtype,
)
Expand Down Expand Up @@ -65,11 +66,12 @@ def cut(
``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
indicate (1,2], (2,3], (3,4]. This argument is ignored when
`bins` is an IntervalIndex.
labels : array or bool, optional
labels : array or False, default None
Specifies the labels for the returned bins. Must be the same length as
the resulting bins. If False, returns only integer indicators of the
bins. This affects the type of the output container (see below).
This argument is ignored when `bins` is an IntervalIndex.
This argument is ignored when `bins` is an IntervalIndex. If True,
raises an error.
retbins : bool, default False
Whether to return the bins or not. Useful when bins is provided
as a scalar.
Expand Down Expand Up @@ -286,10 +288,10 @@ def qcut(
q : int or list-like of int
Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
labels : array or bool, default None
labels : array or False, default None
Used as labels for the resulting bins. Must be of the same length as
the resulting bins. If False, return only integer indicators of the
bins.
bins. If True, raises an error.
retbins : bool, optional
Whether to return the (bins, labels) or not. Can be useful if bins
is given as a scalar.
Expand Down Expand Up @@ -391,15 +393,23 @@ def _bins_to_cuts(
has_nas = na_mask.any()

if labels is not False:
if labels is None:
if not (labels is None or is_list_like(labels)):
raise ValueError(
"Bin labels must either be False, None or passed in as a "
"list-like argument"
)

elif labels is None:
labels = _format_labels(
bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
)

else:
if len(labels) != len(bins) - 1:
raise ValueError(
"Bin labels must be one fewer than the number of bin edges"
)

if not is_categorical_dtype(labels):
labels = Categorical(labels, categories=labels, ordered=True)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/reshape/test_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,3 +603,12 @@ def test_cut_bool_coercion_to_int(bins, box, compare):
expected = cut(data_expected, bins, duplicates="drop")
result = cut(data_result, bins, duplicates="drop")
compare(result, expected)


@pytest.mark.parametrize("labels", ["foo", 1, True])
def test_cut_incorrect_labels(labels):
# GH 13318
values = range(5)
msg = "Bin labels must either be False, None or passed in as a list-like argument"
with pytest.raises(ValueError, match=msg):
cut(values, 4, labels=labels)
32 changes: 32 additions & 0 deletions pandas/tests/reshape/test_qcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,38 @@ def test_qcut_return_intervals():
tm.assert_series_equal(res, exp)


@pytest.mark.parametrize("labels", ["foo", 1, True])
def test_qcut_incorrect_labels(labels):
# GH 13318
values = range(5)
msg = "Bin labels must either be False, None or passed in as a list-like argument"
with pytest.raises(ValueError, match=msg):
qcut(values, 4, labels=labels)


@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))])
def test_qcut_wrong_length_labels(labels):
# GH 13318
values = range(10)
msg = "Bin labels must be one fewer than the number of bin edges"
with pytest.raises(ValueError, match=msg):
qcut(values, 4, labels=labels)


@pytest.mark.parametrize(
"labels, expected",
[
(["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)),
(list(range(3)), Categorical([0, 1, 2], ordered=True)),
],
)
def test_qcut_list_like_labels(labels, expected):
# GH 13318
values = range(3)
result = qcut(values, 3, labels=labels)
tm.assert_categorical_equal(result, expected)


@pytest.mark.parametrize(
"kwargs,msg",
[
Expand Down