Skip to content

ENH: add per axis, per level indexing with tuples using loc #6134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 200 additions & 10 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,7 @@ def _get_label(self, label, axis=0):
return self.obj[label]
elif (isinstance(label, tuple) and
isinstance(label[axis], slice)):

raise IndexingError('no slices here')
raise IndexingError('no slices here, handle elsewhere')

try:
return self.obj._xs(label, axis=axis, copy=False)
Expand Down Expand Up @@ -677,24 +676,32 @@ def _getitem_lowerdim(self, tup):
# a bit kludgy
if isinstance(ax0, MultiIndex):
try:
# fast path for series or for tup devoid of slices
return self._get_label(tup, axis=0)
except TypeError:
# slices are unhashable
pass
except Exception as e1:
if isinstance(tup[0], (slice, Index)):
raise IndexingError
raise IndexingError("Handle elsewhere")

# raise the error if we are not sorted
if not ax0.is_lexsorted_for_tuple(tup):
raise e1
try:
loc = ax0.get_loc(tup[0])
except KeyError:
raise e1

# GH911 introduced this clause, but the regression test
# added for it now passes even without it. Let's rock the boat.
# 2014/01/27

# # should we abort, or keep going?
# try:
# loc = ax0.get_loc(tup[0])
# except KeyError:
# raise e1


if len(tup) > self.obj.ndim:
raise IndexingError
raise IndexingError("Too many indexers. handle elsewhere")

# to avoid wasted computation
# df.ix[d1:d2, 0] -> columns first (True)
Expand All @@ -707,9 +714,9 @@ def _getitem_lowerdim(self, tup):
if not _is_list_like(section):
return section

# might have been a MultiIndex
elif section.ndim == self.ndim:

# we're in the middle of slicing through a MultiIndex
# revise the key wrt to `section` by inserting an _NS
new_key = tup[:i] + (_NS,) + tup[i + 1:]

else:
Expand All @@ -725,6 +732,7 @@ def _getitem_lowerdim(self, tup):
if len(new_key) == 1:
new_key, = new_key

# This is an elided recursive call to iloc/loc/etc'
return getattr(section, self.name)[new_key]

raise IndexingError('not applicable')
Expand Down Expand Up @@ -1148,6 +1156,14 @@ def _getitem_axis(self, key, axis=0):
raise ValueError('Cannot index with multidimensional key')

return self._getitem_iterable(key, axis=axis)
elif isinstance(key, tuple) and isinstance(labels, MultiIndex) and \
any([isinstance(x,slice) for x in key]):
# handle per-axis tuple containing label criteria for
# each level (or a prefix of levels), may contain
# (None) slices, list of labels or labels
specs = _tuple_to_mi_locs(labels,key)
g = _spec_to_array_indices(labels, specs)
return self.obj.iloc[g]
else:
self._has_valid_type(key, axis)
return self._get_label(key, axis=axis)
Expand Down Expand Up @@ -1511,3 +1527,177 @@ def _maybe_droplevels(index, key):
pass

return index

def _tuple_to_mi_locs(ix, tup):
    """Convert a tuple of slices/label lists/labels to a level-wise spec.

    Parameters
    ----------
    ix : MultiIndex
        A sufficiently lexsorted, unique (non-dupe) MultiIndex.
    tup : tuple
        A tuple of slices, labels or lists of labels, one per level.
        slice(None) is acceptable, and when len(tup) < ix.nlevels the
        trailing levels are treated as "include all labels".

    Returns
    -------
    a list containing ix.nlevels elements of either:
    - a 2-tuple representing a half-open (start, stop) position range
    or
    - a list of label positions.

    The positions are relative to the labels of the corresponding level,
    not to the entire unrolled index.

    Example (This is *not* a doctest):
    >>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'], ['B0', 'B1']])
    >>> for x in mi.get_values(): print(x)
    ('A0', 'B0')
    ('A0', 'B1')
    ('A1', 'B0')
    ('A1', 'B1')
    ('A2', 'B0')
    ('A2', 'B1')
    >>> _tuple_to_mi_locs(mi, (slice('A0', 'A2'), ['B0', 'B1']))
    [(0, 2), [0, 1]]

    read as:
    - All labels at positions [0, 2) in the first level
    - for each of those, all labels at positions 0 or 1.

    The same effective result can be achieved by specifying the None
    slice, or omitting it completely. Note the tuple (0, 2) replaces the
    list [0, 1], but the outcome is the same.

    >>> _tuple_to_mi_locs(mi, (slice('A0', 'A2'), slice(None)))
    [(0, 2), (0, 2)]

    >>> _tuple_to_mi_locs(mi, (slice('A0', 'A2'),))
    [(0, 2), (0, 2)]

    """

    # ix must be lexsorted to at least as many levels
    # as there are elements in `tup`, and free of duplicates
    assert ix.is_lexsorted_for_tuple(tup)
    assert ix.is_unique
    assert isinstance(ix, MultiIndex)

    ranges = []

    for i, k in enumerate(tup):
        level = ix.levels[i]

        if _is_list_like(k):
            # a collection of labels to include from this level
            ranges.append([level.get_loc(x) for x in k])
        elif isinstance(k, slice):
            # a label-based slice; a None endpoint means "from the start"
            # / "to the end" of the level (this also covers slice(None)).
            # Compare against None explicitly so a falsy label (e.g. 0)
            # is still honoured as an endpoint.
            start = 0 if k.start is None else level.get_loc(k.start)
            stop = len(level) if k.stop is None else level.get_loc(k.stop)
            ranges.append((start, stop))
        else:
            # a single label: a half-open range holding exactly one
            # position. (start, start) would be empty and select nothing.
            start = level.get_loc(k)
            ranges.append((start, start + 1))

    # omitting trailing levels means include all their values
    for j in range(len(tup), len(ix.levels)):
        ranges.append((0, len(ix.levels[j])))

    return ranges

def _spec_to_array_indices(ix, specs):
    """Convert a level-wise spec into row positions relative to `ix`.

    Parameters
    ----------
    ix : MultiIndex
        A sufficiently lexsorted, unique (non-dupe) MultiIndex.
    specs : list
        A list of 2-tuples / lists of label positions. Specifically, the
        output of _tuple_to_mi_locs.
        len(specs) must match ix.nlevels.

    Returns
    -------
    an ndarray of row positions relative to ix, corresponding to specs.
    Suitable for usage with `iloc`.

    Example (This is *not* a doctest):
    >>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'], ['B0', 'B1']])
    >>> for x in mi.get_values(): print(x)
    ('A0', 'B0')
    ('A0', 'B1')
    ('A1', 'B0')
    ('A1', 'B1')
    ('A2', 'B0')
    ('A2', 'B1')

    >>> specs = _tuple_to_mi_locs(mi, (slice('A0', 'A2'), ['B0', 'B1']))
    >>> list(_spec_to_array_indices(mi, specs))
    [0, 1, 2, 3]

    Which are all the rows having 'A0' to 'A2' (non-inclusive) at level=0
    and 'B0' or 'B1' at level=1.

    """
    # NOTE(review): this overlaps with MultiIndex._get_loc_level; it may
    # belong on MultiIndex (or a dedicated slicer object) when that code
    # is refactored.
    assert ix.is_lexsorted_for_tuple(specs)
    assert len(specs) == ix.nlevels
    assert ix.is_unique
    assert isinstance(ix, MultiIndex)

    # step size/increment for iteration at each level: the number of
    # unrolled rows spanned by a single label at that level.
    # NOTE(review): this arithmetic presumes the index covers the full
    # cartesian product of its levels — confirm for sparse indexes.
    giant_steps = np.cumprod(ix.levshape[::-1])[::-1]
    giant_steps[:-1] = giant_steps[1:]
    giant_steps[-1] = 1

    def _positions(spec):
        # tuples are half-open (start, stop) label-position ranges;
        # lists are discrete label positions to include
        if isinstance(spec, tuple):
            return compat.range(*spec)
        return spec

    def _vectorize(i=0):
        step_size = giant_steps[i]
        valrange = _positions(specs[i])
        if i == len(specs) - 1:
            # base case: the innermost level's positions are the offsets
            return np.array(valrange)
        # tile the inner levels' offsets once per label position at this
        # level, shifting each copy by that label's base row position
        tmpl = np.asarray(_vectorize(i + 1))
        res = np.tile(tmpl, (len(valrange), 1))
        steps = (np.array(valrange) * step_size).reshape((len(valrange), 1))
        return (res + steps).flatten()

    return _vectorize()