Skip to content

Commit 7f8bf8e

Browse files
authored
BUG: obj.loc[listlike] with missing keys and CategoricalIndex (#37901)
1 parent 3639cb7 commit 7f8bf8e

File tree

5 files changed

+65
-64
lines changed

5 files changed

+65
-64
lines changed

doc/source/whatsnew/v1.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,7 @@ Indexing
587587
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`)
588588
- Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`)
589589
- Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty :class:`DataFrame` with ``tolerance`` not None or ``method="nearest"`` (:issue:`27315`)
590+
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`)
590591

591592
Missing
592593
^^^^^^^

pandas/core/indexes/category.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -576,23 +576,11 @@ def _convert_list_indexer(self, keyarr):
576576
# the categories
577577

578578
if self.categories._defer_to_indexing:
579+
# See tests.indexing.interval.test_interval:test_loc_getitem_frame
579580
indexer = self.categories._convert_list_indexer(keyarr)
580581
return Index(self.codes).get_indexer_for(indexer)
581582

582-
msg = "a list-indexer must only include values that are in the categories"
583-
if self.hasnans:
584-
msg += " or NA"
585-
try:
586-
codes = self._data._validate_setitem_value(keyarr)
587-
except (ValueError, TypeError) as err:
588-
if "Index data must be 1-dimensional" in str(err):
589-
# e.g. test_setitem_ndarray_3d
590-
raise
591-
raise KeyError(msg)
592-
if not self.hasnans and (codes == -1).any():
593-
raise KeyError(msg)
594-
595-
return self.get_indexer(keyarr)
583+
return self.get_indexer_for(keyarr)
596584

597585
@doc(Index._maybe_cast_slice_bound)
598586
def _maybe_cast_slice_bound(self, label, side: str, kind):

pandas/core/indexing.py

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1246,9 +1246,7 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False):
12461246
indexer, keyarr = ax._convert_listlike_indexer(key)
12471247
# We only act on all found values:
12481248
if indexer is not None and (indexer != -1).all():
1249-
self._validate_read_indexer(
1250-
keyarr, indexer, axis, raise_missing=raise_missing
1251-
)
1249+
# _validate_read_indexer is a no-op if no -1s, so skip
12521250
return ax[indexer], indexer
12531251

12541252
if ax._index_as_unique:
@@ -1309,21 +1307,15 @@ def _validate_read_indexer(
13091307
not_found = list(set(key) - set(ax))
13101308
raise KeyError(f"{not_found} not in index")
13111309

1312-
# we skip the warning on Categorical
1313-
# as this check is actually done (check for
1314-
# non-missing values), but a bit later in the
1315-
# code, so we want to avoid warning & then
1316-
# just raising
1317-
if not ax.is_categorical():
1318-
not_found = key[missing_mask]
1319-
1320-
with option_context("display.max_seq_items", 10, "display.width", 80):
1321-
raise KeyError(
1322-
"Passing list-likes to .loc or [] with any missing labels "
1323-
"is no longer supported. "
1324-
f"The following labels were missing: {not_found}. "
1325-
"See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501
1326-
)
1310+
not_found = key[missing_mask]
1311+
1312+
with option_context("display.max_seq_items", 10, "display.width", 80):
1313+
raise KeyError(
1314+
"Passing list-likes to .loc or [] with any missing labels "
1315+
"is no longer supported. "
1316+
f"The following labels were missing: {not_found}. "
1317+
"See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501
1318+
)
13271319

13281320

13291321
@doc(IndexingMixin.iloc)

pandas/tests/indexing/test_categorical.py

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import re
2+
13
import numpy as np
24
import pytest
35

@@ -254,41 +256,45 @@ def test_slicing_doc_examples(self):
254256
)
255257
tm.assert_frame_equal(result, expected)
256258

257-
def test_loc_listlike(self):
258-
259+
def test_loc_getitem_listlike_labels(self):
259260
# list of labels
260261
result = self.df.loc[["c", "a"]]
261262
expected = self.df.iloc[[4, 0, 1, 5]]
262263
tm.assert_frame_equal(result, expected, check_index_type=True)
263264

264-
result = self.df2.loc[["a", "b", "e"]]
265-
exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B")
266-
expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
267-
tm.assert_frame_equal(result, expected, check_index_type=True)
265+
def test_loc_getitem_listlike_unused_category(self):
266+
# GH#37901 a label that is in index.categories but not in index
267+
# listlike containing an element in the categories but not in the values
268+
msg = (
269+
"The following labels were missing: CategoricalIndex(['e'], "
270+
"categories=['c', 'a', 'b', 'e'], ordered=False, name='B', "
271+
"dtype='category')"
272+
)
273+
with pytest.raises(KeyError, match=re.escape(msg)):
274+
self.df2.loc[["a", "b", "e"]]
268275

276+
def test_loc_getitem_label_unused_category(self):
269277
# element in the categories but not in the values
270278
with pytest.raises(KeyError, match=r"^'e'$"):
271279
self.df2.loc["e"]
272280

273-
# assign is ok
281+
def test_loc_getitem_non_category(self):
282+
# not all labels in the categories
283+
msg = (
284+
"The following labels were missing: Index(['d'], dtype='object', name='B')"
285+
)
286+
with pytest.raises(KeyError, match=re.escape(msg)):
287+
self.df2.loc[["a", "d"]]
288+
289+
def test_loc_setitem_expansion_label_unused_category(self):
290+
# assigning with a label that is in the categories but not in the index
274291
df = self.df2.copy()
275292
df.loc["e"] = 20
276293
result = df.loc[["a", "b", "e"]]
277294
exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B")
278295
expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index)
279296
tm.assert_frame_equal(result, expected)
280297

281-
df = self.df2.copy()
282-
result = df.loc[["a", "b", "e"]]
283-
exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B")
284-
expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
285-
tm.assert_frame_equal(result, expected, check_index_type=True)
286-
287-
# not all labels in the categories
288-
msg = "a list-indexer must only include values that are in the categories"
289-
with pytest.raises(KeyError, match=msg):
290-
self.df2.loc[["a", "d"]]
291-
292298
def test_loc_listlike_dtypes(self):
293299
# GH 11586
294300

@@ -309,8 +315,8 @@ def test_loc_listlike_dtypes(self):
309315
exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index)
310316
tm.assert_frame_equal(res, exp, check_index_type=True)
311317

312-
msg = "a list-indexer must only include values that are in the categories"
313-
with pytest.raises(KeyError, match=msg):
318+
msg = "The following labels were missing: Index(['x'], dtype='object')"
319+
with pytest.raises(KeyError, match=re.escape(msg)):
314320
df.loc[["a", "x"]]
315321

316322
# duplicated categories and codes
@@ -332,8 +338,7 @@ def test_loc_listlike_dtypes(self):
332338
)
333339
tm.assert_frame_equal(res, exp, check_index_type=True)
334340

335-
msg = "a list-indexer must only include values that are in the categories"
336-
with pytest.raises(KeyError, match=msg):
341+
with pytest.raises(KeyError, match=re.escape(msg)):
337342
df.loc[["a", "x"]]
338343

339344
# contains unused category
@@ -347,13 +352,6 @@ def test_loc_listlike_dtypes(self):
347352
)
348353
tm.assert_frame_equal(res, exp, check_index_type=True)
349354

350-
res = df.loc[["a", "e"]]
351-
exp = DataFrame(
352-
{"A": [1, 3, np.nan], "B": [5, 7, np.nan]},
353-
index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")),
354-
)
355-
tm.assert_frame_equal(res, exp, check_index_type=True)
356-
357355
# duplicated slice
358356
res = df.loc[["a", "a", "b"]]
359357
exp = DataFrame(
@@ -362,10 +360,27 @@ def test_loc_listlike_dtypes(self):
362360
)
363361
tm.assert_frame_equal(res, exp, check_index_type=True)
364362

365-
msg = "a list-indexer must only include values that are in the categories"
366-
with pytest.raises(KeyError, match=msg):
363+
with pytest.raises(KeyError, match=re.escape(msg)):
367364
df.loc[["a", "x"]]
368365

366+
def test_loc_getitem_listlike_unused_category_raises_keyerro(self):
367+
# key that is an *unused* category raises
368+
index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
369+
df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)
370+
371+
with pytest.raises(KeyError, match="e"):
372+
# For comparison, check the scalar behavior
373+
df.loc["e"]
374+
375+
msg = (
376+
"Passing list-likes to .loc or [] with any missing labels is no "
377+
"longer supported. The following labels were missing: "
378+
"CategoricalIndex(['e'], categories=['a', 'b', 'c', 'd', 'e'], "
379+
"ordered=False, dtype='category'). See https"
380+
)
381+
with pytest.raises(KeyError, match=re.escape(msg)):
382+
df.loc[["a", "e"]]
383+
369384
def test_ix_categorical_index(self):
370385
# GH 12531
371386
df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ"))

pandas/tests/indexing/test_loc.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1674,7 +1674,12 @@ def test_loc_getitem_list_of_labels_categoricalindex_with_na(self, box):
16741674
ser2 = ser[:-1]
16751675
ci2 = ci[1:]
16761676
# but if there are no NAs present, this should raise KeyError
1677-
msg = "a list-indexer must only include values that are in the categories"
1677+
msg = (
1678+
r"Passing list-likes to .loc or \[\] with any missing labels is no "
1679+
"longer supported. The following labels were missing: "
1680+
r"(Categorical)?Index\(\[nan\], .*\). "
1681+
"See https"
1682+
)
16781683
with pytest.raises(KeyError, match=msg):
16791684
ser2.loc[box(ci2)]
16801685

0 commit comments

Comments
 (0)