Skip to content

Commit cfe5cb9

Browse files
committed
BUG: implement new engine for codes-based MultiIndex indexing
closes pandas-dev#18519 closes pandas-dev#18818 closes pandas-dev#18520 closes pandas-dev#18485 closes pandas-dev#15994 closes pandas-dev#19086
1 parent 9303315 commit cfe5cb9

File tree

3 files changed

+112
-10
lines changed

3 files changed

+112
-10
lines changed

doc/source/whatsnew/v0.23.0.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,10 @@ Indexing
318318
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
319319
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
320320
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`)
321+
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
322+
- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
323+
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing NaN (:issue:`18485`)
324+
- Bug in :func:`MultiIndex.get_loc` on a large :class:`MultiIndex` which would fail when levels had different dtypes (:issue:`18520`)
321325
- Bug in :class:`Index` construction from list of mixed type tuples (:issue:`18505`)
322326
- Bug in :func:`Index.drop` when passing a list of both tuples and non-tuples (:issue:`18304`)
323327
- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)

pandas/_libs/index.pyx

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ from hashtable cimport HashTable
2020
from pandas._libs import algos, hashtable as _hash
2121
from pandas._libs.tslibs import period as periodlib
2222
from pandas._libs.tslib import Timestamp, Timedelta
23+
from pandas._libs.missing import checknull
2324
from datetime import datetime, timedelta, date
2425

2526
from cpython cimport PyTuple_Check, PyList_Check
@@ -599,6 +600,74 @@ cpdef convert_scalar(ndarray arr, object value):
599600
return value
600601

601602

603+
cdef class BaseMultiIndexCodesEngine(object):
    """
    Base engine for MultiIndex lookups working on level codes.

    Every combination of per-level codes is collapsed into one integer, and
    the actual lookup is delegated to the engine class stored in
    ``self._base``.

    Subclasses are expected to provide ``_base`` and ``_codes_to_ints``;
    neither of them is defined in this class.
    """

    def __init__(self, levels, labels, offsets, **kwargs):
        """
        Parameters
        ----------
        levels : sequence
            One index-like object per level; each is used through its
            ``get_loc`` and ``get_indexer`` methods.
        labels : array-like of integers
            Codes of the index, transposed below so that (presumably) each
            row of the result describes one entry of the index.
        offsets : object
            Stored as ``self._offsets`` for use by ``_codes_to_ints``.
        **kwargs
            Forwarded unchanged to ``self._base.__init__``.
        """
        self._levels = levels
        self._offsets = offsets

        # Map each combination to an integer. Codes are shifted by one so
        # that the smallest value is 0 and everything fits an unsigned dtype.
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64')
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index
        self._base.__init__(self, lambda: lab_ints, len(lab_ints), **kwargs)

    def _target_to_ints(self, target, method=None):
        """
        Translate an iterable of key tuples into the integer representation
        used by the underlying engine.

        Each level looks up its own component of every key; the resulting
        positions are shifted by one (so that "not found", -1, becomes 0)
        and combined through ``_codes_to_ints``.
        """
        level_codes = [self._levels[lev].get_indexer(vals, method=method) + 1
                       for lev, vals in enumerate(zip(*target))]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, target, method=None, limit=None):
        """
        Return, for each key in ``target``, the position found by the
        underlying engine.

        Parameters
        ----------
        target : iterable of tuples
            Keys to look up, one component per level.
        method : str, optional
            Name of the fill method; it selects ``get_<method>_indexer`` on
            the underlying engine (presumably 'pad' or 'backfill').
        limit : int, optional
            Passed through to the fill method.

        Returns
        -------
        Whatever the underlying engine returns, ordered like ``target``.
        """
        keys_int = self._target_to_ints(target, method=method)

        if method is None:
            return self._base.get_indexer(self, keys_int)

        # keys must be sorted - the engine already is
        order = np.argsort(keys_int)
        fill = getattr(self._base, 'get_{}_indexer'.format(method))
        sorted_indexer = fill(self, keys_int[order], limit=limit)

        # Undo the sort: sorted_indexer[i] answers the key that originally
        # sat at position order[i]. Indexing with ``order`` again (as was
        # done previously) applies the permutation twice instead of
        # inverting it.
        indexer = np.empty_like(sorted_indexer)
        indexer[order] = sorted_indexer
        return indexer

    def get_loc(self, key):
        """
        Return the location of ``key`` as reported by the underlying engine.

        Raises
        ------
        TypeError
            If ``key`` is a definitely invalid key.
        KeyError
            If ``key`` is not a tuple, or one of its components is missing
            from the corresponding level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            # Null components map to code 0; real values to position + 1
            idces = [0 if checknull(v) else self._levels[lev].get_loc(v) + 1
                     for lev, v in enumerate(key)]
        except KeyError:
            # Report the full key rather than the single failing component
            raise KeyError(key)
        idces = np.array(idces, ndmin=2, dtype='uint64')

        key_int = self._codes_to_ints(idces)[0]

        return self._base.get_loc(self, key_int)

    def get_indexer_non_unique(self, target):
        """
        Look up ``target`` through the underlying engine's
        ``get_indexer_non_unique``.
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.
        keys_int = self._target_to_ints(target)

        return self._base.get_indexer_non_unique(self, keys_int)

    def __contains__(self, object val):
        """Return True if ``get_loc(val)`` succeeds, False otherwise."""
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
668+
669+
670+
602671
cdef class MultiIndexObjectEngine(ObjectEngine):
603672
"""
604673
provide the same interface as the MultiIndexEngine

pandas/core/indexes/multi.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,36 @@
5050
target_klass='MultiIndex or list of tuples'))
5151

5252

53+
class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine,
                           libindex.UInt64Engine):
    """
    Engine for a MultiIndex in which every combination of labels is encoded
    as a single positive integer handled by ``libindex.UInt64Engine``.
    """
    _base = libindex.UInt64Engine

    def _codes_to_ints(self, codes):
        """
        Collapse each row of ``codes`` into one integer.

        Note that ``codes`` is shifted in place, so the array passed by the
        caller is modified.
        """
        # Shift every column by its offset, writing back into ``codes``
        np.left_shift(codes, self._offsets, out=codes)
        # After the shift, OR-ing the columns is interchangeable with
        # summing them
        combined = np.bitwise_or.reduce(codes, axis=1)
        return combined
65+
66+
67+
class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
                            libindex.ObjectEngine):
    """
    Fallback engine for the (extreme) case where the number of possible
    label combinations does not fit in a 64 bit integer: combinations are
    stored as Python integers inside an ``ObjectEngine``.
    """
    _base = libindex.ObjectEngine

    def _codes_to_ints(self, codes):
        """
        Collapse each row of ``codes`` into one Python integer.

        The input array is left untouched: the shift operates on an
        object-dtype copy.
        """
        # Convert to Python integers first, then shift each column by its
        # offset
        as_objects = codes.astype('object')
        shifted = np.left_shift(as_objects, self._offsets)
        # After the shift, OR-ing the columns is interchangeable with
        # summing them
        return np.bitwise_or.reduce(shifted, axis=1)
81+
82+
5383
class MultiIndex(Index):
5484
"""
5585
A multi-level, or hierarchical, index object for pandas objects
@@ -691,16 +721,15 @@ def _get_level_number(self, level):
691721

692722
@cache_readonly
693723
def _engine(self):
724+
# Find powers of 2 which dominate level sizes - including -1 for NaN:
725+
lev_bits = np.cumsum(np.ceil(np.log2([len(l) + 1 for l in
726+
self.levels[::-1]])))[::-1]
727+
offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint')
694728

695-
# choose our engine based on our size
696-
# the hashing based MultiIndex for larger
697-
# sizes, and the MultiIndexOjbect for smaller
698-
# xref: https://github.com/pandas-dev/pandas/pull/16324
699-
l = len(self)
700-
if l > 10000:
701-
return libindex.MultiIndexHashEngine(lambda: self, l)
702-
703-
return libindex.MultiIndexObjectEngine(lambda: self.values, l)
729+
if lev_bits[0] > 64:
730+
# The levels would overflow a 64 bit integer - use Python integers:
731+
return MultiIndexPyIntEngine(self.levels, self.labels, offsets)
732+
return MultiIndexUIntEngine(self.levels, self.labels, offsets)
704733

705734
@property
706735
def values(self):
@@ -1889,7 +1918,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
18891918
if tolerance is not None:
18901919
raise NotImplementedError("tolerance not implemented yet "
18911920
'for MultiIndex')
1892-
indexer = self._get_fill_indexer(target, method, limit)
1921+
indexer = self._engine.get_indexer(target, method, limit)
18931922
elif method == 'nearest':
18941923
raise NotImplementedError("method='nearest' not implemented yet "
18951924
'for MultiIndex; see GitHub issue 9365')

0 commit comments

Comments
 (0)