Skip to content

BUG: Hash and compare tuple subclasses as builtin tuples #59286

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jul 22, 2024
Merged
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Other enhancements
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
- :meth:`Index.get_loc` now also accepts subclasses of ``tuple`` as keys (:issue:`57922`)
- :meth:`Styler.set_tooltips` provides an alternative method of storing tooltips by using the title attribute of td elements. (:issue:`56981`)
- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
Expand Down Expand Up @@ -231,6 +232,7 @@ Other API changes
^^^^^^^^^^^^^^^^^
- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
- :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`)
- All classes inheriting from builtin ``tuple`` (including types created with :func:`collections.namedtuple`) are now hashed and compared as builtin ``tuple`` during indexing operations (:issue:`57922`)
- Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`)
- Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`)
- Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`)
Expand Down
6 changes: 4 additions & 2 deletions pandas/_libs/include/pandas/vendored/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,8 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) {
if (PyComplex_CheckExact(a)) {
return complexobject_cmp((PyComplexObject *)a, (PyComplexObject *)b);
}
if (PyTuple_CheckExact(a)) {
if (PyTuple_Check(a)) {
// compare tuple subclasses as builtin tuples
return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b);
}
// frozenset isn't yet supported
Expand Down Expand Up @@ -311,7 +312,8 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) {
// because complex(k,0) == k holds for any int-object k
// and kh_complex128_hash_func doesn't respect it
hash = complexobject_hash((PyComplexObject *)key);
} else if (PyTuple_CheckExact(key)) {
} else if (PyTuple_Check(key)) {
// hash tuple subclasses as builtin tuples
hash = tupleobject_hash((PyTupleObject *)key);
} else {
hash = PyObject_Hash(key);
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import namedtuple
from datetime import timedelta
import re

Expand Down Expand Up @@ -1006,3 +1007,26 @@ def test_get_indexer_for_multiindex_with_nans(nulls_fixture):
result = idx1.get_indexer(idx2)
expected = np.array([-1, 1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)


def test_get_loc_namedtuple_behaves_like_tuple():
# GH57922
NamedIndex = namedtuple("NamedIndex", ("a", "b"))
multi_idx = MultiIndex.from_tuples(
[NamedIndex("i1", "i2"), NamedIndex("i3", "i4"), NamedIndex("i5", "i6")]
)
for idx in (multi_idx, multi_idx.to_flat_index()):
assert idx.get_loc(NamedIndex("i1", "i2")) == 0
assert idx.get_loc(NamedIndex("i3", "i4")) == 1
assert idx.get_loc(NamedIndex("i5", "i6")) == 2
assert idx.get_loc(("i1", "i2")) == 0
assert idx.get_loc(("i3", "i4")) == 1
assert idx.get_loc(("i5", "i6")) == 2
multi_idx = MultiIndex.from_tuples([("i1", "i2"), ("i3", "i4"), ("i5", "i6")])
for idx in (multi_idx, multi_idx.to_flat_index()):
assert idx.get_loc(NamedIndex("i1", "i2")) == 0
assert idx.get_loc(NamedIndex("i3", "i4")) == 1
assert idx.get_loc(NamedIndex("i5", "i6")) == 2
assert idx.get_loc(("i1", "i2")) == 0
assert idx.get_loc(("i3", "i4")) == 1
assert idx.get_loc(("i5", "i6")) == 2
46 changes: 40 additions & 6 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import namedtuple
from collections.abc import Generator
from contextlib import contextmanager
import re
Expand Down Expand Up @@ -405,9 +406,8 @@ def test_nan_complex_real(self):
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42
with pytest.raises(KeyError, match=None) as error:
with pytest.raises(KeyError, match=re.escape(repr(other))):
table.get_item(other)
assert str(error.value) == str(other)

def test_nan_complex_imag(self):
nan1 = complex(1, float("nan"))
Expand All @@ -417,9 +417,8 @@ def test_nan_complex_imag(self):
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42
with pytest.raises(KeyError, match=None) as error:
with pytest.raises(KeyError, match=re.escape(repr(other))):
table.get_item(other)
assert str(error.value) == str(other)

def test_nan_in_tuple(self):
nan1 = (float("nan"),)
Expand All @@ -436,9 +435,28 @@ def test_nan_in_nested_tuple(self):
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42
with pytest.raises(KeyError, match=None) as error:
with pytest.raises(KeyError, match=re.escape(repr(other))):
table.get_item(other)

def test_nan_in_namedtuple(self):
T = namedtuple("T", ["x"])
nan1 = T(float("nan"))
nan2 = T(float("nan"))
assert nan1.x is not nan2.x
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42

def test_nan_in_nested_namedtuple(self):
    """Look up a namedtuple with a nested NaN; an unrelated key must raise."""
    Pair = namedtuple("T", ["x", "y"])
    stored_key = Pair(1, (2, (float("nan"),)))
    probe_key = Pair(1, (2, (float("nan"),)))
    missing_key = Pair(1, 2)
    hashtable = ht.PyObjectHashTable()
    hashtable.set_item(stored_key, 42)
    assert hashtable.get_item(probe_key) == 42
    # The KeyError message is expected to contain the repr of the missing key.
    expected_message = re.escape(repr(missing_key))
    with pytest.raises(KeyError, match=expected_message):
        hashtable.get_item(missing_key)
assert str(error.value) == str(other)


def test_hash_equal_tuple_with_nans():
Expand All @@ -448,6 +466,22 @@ def test_hash_equal_tuple_with_nans():
assert ht.objects_are_equal(a, b)


def test_hash_equal_namedtuple_with_nans():
T = namedtuple("T", ["x", "y"])
a = T(float("nan"), (float("nan"), float("nan")))
b = T(float("nan"), (float("nan"), float("nan")))
assert ht.object_hash(a) == ht.object_hash(b)
assert ht.objects_are_equal(a, b)


def test_hash_equal_namedtuple_and_tuple():
T = namedtuple("T", ["x", "y"])
a = T(1, (2, 3))
b = (1, (2, 3))
assert ht.object_hash(a) == ht.object_hash(b)
assert ht.objects_are_equal(a, b)


def test_get_labels_groupby_for_Int64(writable):
table = ht.Int64HashTable()
vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)
Expand Down