Skip to content

reindex multi-index at level with reordered labels #9019

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 7, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.15.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ Bug Fixes
- Bug in ``BlockManager`` where setting values with different type would break block integrity (:issue:`8850`)
- Bug in ``DatetimeIndex`` when using ``time`` object as key (:issue:`8667`)
- Bug in ``merge`` where ``how='left'`` and ``sort=False`` would not preserve left frame order (:issue:`7331`)
- Bug in ``MultiIndex.reindex`` where reindexing at level would not reorder labels (:issue:`4088`)

- Fix negative step support for label-based slices (:issue:`8753`)

Expand Down
97 changes: 81 additions & 16 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1828,13 +1828,41 @@ def _join_non_unique(self, other, how='left', return_indexers=False):
else:
return join_index

def _join_level(self, other, level, how='left', return_indexers=False):
def _join_level(self, other, level, how='left',
return_indexers=False,
keep_order=True):
"""
The join method *only* affects the level of the resulting
MultiIndex. Otherwise it just exactly aligns the Index data to the
labels of the level in the MultiIndex. The order of the data indexed by
the MultiIndex will not be changed (currently)
"""
labels of the level in the MultiIndex. If `keep_order` == True, the
order of the data indexed by the MultiIndex will not be changed;
otherwise, it will tie out with `other`.
"""
from pandas.algos import groupsort_indexer

def _get_leaf_sorter(labels):
'''
returns sorter for the inner most level while preserving the
order of higher levels
'''
if labels[0].size == 0:
return np.empty(0, dtype='int64')

if len(labels) == 1:
lab = com._ensure_int64(labels[0])
sorter, _ = groupsort_indexer(lab, 1 + lab.max())
return sorter

# find indexers of begining of each set of
# same-key labels w.r.t all but last level
tic = labels[0][:-1] != labels[0][1:]
for lab in labels[1:-1]:
tic |= lab[:-1] != lab[1:]

starts = np.hstack(([True], tic, [True])).nonzero()[0]
lab = com._ensure_int64(labels[-1])
return lib.get_level_sorter(lab, starts)

if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
raise TypeError('Join on level between two MultiIndex objects '
'is ambiguous')
Expand All @@ -1849,33 +1877,69 @@ def _join_level(self, other, level, how='left', return_indexers=False):
level = left._get_level_number(level)
old_level = left.levels[level]

if not right.is_unique:
raise NotImplementedError('Index._join_level on non-unique index '
'is not implemented')

new_level, left_lev_indexer, right_lev_indexer = \
old_level.join(right, how=how, return_indexers=True)

if left_lev_indexer is not None:
if left_lev_indexer is None:
if keep_order or len(left) == 0:
left_indexer = None
join_index = left
else: # sort the leaves
left_indexer = _get_leaf_sorter(left.labels[:level + 1])
join_index = left[left_indexer]

else:
left_lev_indexer = com._ensure_int64(left_lev_indexer)
rev_indexer = lib.get_reverse_indexer(left_lev_indexer,
len(old_level))

new_lev_labels = com.take_nd(rev_indexer, left.labels[level],
allow_fill=False)
omit_mask = new_lev_labels != -1

new_labels = list(left.labels)
new_labels[level] = new_lev_labels

if not omit_mask.all():
new_labels = [lab[omit_mask] for lab in new_labels]

new_levels = list(left.levels)
new_levels[level] = new_level

join_index = MultiIndex(levels=new_levels, labels=new_labels,
names=left.names, verify_integrity=False)
left_indexer = np.arange(len(left))[new_lev_labels != -1]
else:
join_index = left
left_indexer = None
if keep_order: # just drop missing values. o.w. keep order
left_indexer = np.arange(len(left))
mask = new_lev_labels != -1
if not mask.all():
new_labels = [lab[mask] for lab in new_labels]
left_indexer = left_indexer[mask]

else: # tie out the order with other
if level == 0: # outer most level, take the fast route
ngroups = 1 + new_lev_labels.max()
left_indexer, counts = groupsort_indexer(new_lev_labels,
ngroups)
# missing values are placed first; drop them!
left_indexer = left_indexer[counts[0]:]
new_labels = [lab[left_indexer] for lab in new_labels]

else: # sort the leaves
mask = new_lev_labels != -1
mask_all = mask.all()
if not mask_all:
new_labels = [lab[mask] for lab in new_labels]

left_indexer = _get_leaf_sorter(new_labels[:level + 1])
new_labels = [lab[left_indexer] for lab in new_labels]

# left_indexers are w.r.t masked frame.
# reverse to original frame!
if not mask_all:
left_indexer = mask.nonzero()[0][left_indexer]

join_index = MultiIndex(levels=new_levels,
labels=new_labels,
names=left.names,
verify_integrity=False)

if right_lev_indexer is not None:
right_indexer = com.take_nd(right_lev_indexer,
Expand Down Expand Up @@ -3925,7 +3989,8 @@ def reindex(self, target, method=None, level=None, limit=None):
else:
target = _ensure_index(target)
target, indexer, _ = self._join_level(target, level, how='right',
return_indexers=True)
return_indexers=True,
keep_order=False)
else:
if self.equals(target):
indexer = None
Expand Down
21 changes: 21 additions & 0 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,27 @@ def row_bool_subset_object(ndarray[object, ndim=2] values,

return out

@cython.boundscheck(False)
@cython.wraparound(False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, doesn't something like this already exist in groupby.py (e.g. in the sortlevel section)? It doesn't look like it at a casual glance.

def get_level_sorter(ndarray[int64_t, ndim=1] label,
                     ndarray[int64_t, ndim=1] starts):
    """
    argsort for a single level of a multi-index, keeping the order of
    higher levels unchanged.

    `starts` holds the boundaries of the runs of rows that share the same
    key w.r.t. the leading levels. Each run ``label[starts[i]:starts[i+1]]``
    is argsorted on its own with a mergesort and shifted by ``starts[i]``,
    so the result indexes into the full `label` array; equivalent to:
        np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort')
                   + starts[i] for i in range(len(starts) - 1)])

    The output buffer is allocated with ``np.empty``, so any position not
    covered by a ``[starts[i], starts[i+1])`` run is left uninitialized.
    """
    cdef:
        int64_t lo, hi
        Py_ssize_t k
        ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64)

    for k in range(len(starts) - 1):
        lo = starts[k]
        hi = starts[k + 1]
        # sort within the run, then offset back to absolute positions
        out[lo:hi] = label[lo:hi].argsort(kind='mergesort') + lo

    return out

def group_count(ndarray[int64_t] values, Py_ssize_t size):
cdef:
Expand Down
60 changes: 60 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1897,6 +1897,66 @@ def test_reversed_reindex_ffill_raises(self):
self.assertRaises(ValueError, df.reindex, dr[::-1], method='ffill')
self.assertRaises(ValueError, df.reindex, dr[::-1], method='bfill')

def test_reindex_level(self):
    # Reindexing a frame on one level of its MultiIndex must order the
    # rows to follow the requested labels and drop rows whose label on
    # that level is not requested.
    from itertools import permutations
    icol = ['jim', 'joe', 'jolie']

    def verify_first_level(df, level, idx):
        # expected rows: for each requested label, in request order,
        # the positions of the rows carrying that label
        positions = [np.nonzero(df[level] == val)[0] for val in idx]
        indexer = np.concatenate(positions)
        result = df.set_index(icol).reindex(idx, level=level)
        expected = df.iloc[indexer].set_index(icol)
        assert_frame_equal(result, expected)

    def verify(df, level, idx, indexer):
        # expected rows are given explicitly as positional `indexer`
        result = df.set_index(icol).reindex(idx, level=level)
        expected = df.iloc[indexer].set_index(icol)
        assert_frame_equal(result, expected)

    df = pd.DataFrame({'jim':list('B' * 4 + 'A' * 2 + 'C' * 3),
                       'joe':list('abcdeabcd')[::-1],
                       'jolie':[10, 20, 30] * 3,
                       'joline': np.random.randint(0, 1000, 9)})

    target = [['C', 'B', 'A'], ['F', 'C', 'A', 'D'], ['A'], ['D', 'F'],
              ['A', 'B', 'C'], ['C', 'A', 'B'], ['C', 'B'], ['C', 'A'],
              ['A', 'B'], ['B', 'A', 'C'], ['A', 'C', 'B']]

    # outermost level
    for idx in target:
        verify_first_level(df, 'jim', idx)

    # inner level: requested labels -> expected positional indexer
    joe_cases = [
        ('abcde', [3, 2, 1, 0, 5, 4, 8, 7, 6]),
        ('abcd', [3, 2, 1, 0, 5, 8, 7, 6]),
        ('abc', [3, 2, 1, 8, 7, 6]),
        ('eca', [1, 3, 4, 6, 8]),
        ('edc', [0, 1, 4, 5, 6]),
        ('eadbc', [3, 0, 2, 1, 4, 5, 8, 7, 6]),
        ('edwq', [0, 4, 5]),
        ('wq', []),
    ]
    for letters, indexer in joe_cases:
        verify(df, 'joe', list(letters), indexer)

    df = DataFrame({'jim':['mid'] * 5 + ['btm'] * 8 + ['top'] * 7,
                    'joe':['3rd'] * 2 + ['1st'] * 3 + ['2nd'] * 3 +
                          ['1st'] * 2 + ['3rd'] * 3 + ['1st'] * 2 +
                          ['3rd'] * 3 + ['2nd'] * 2,
                    'jolie':np.random.randint(0, 1000, 20),
                    'joline': np.random.randn(20).round(3) * 10})

    # every ordering of the outer labels, truncated to 1, 2 and 3 labels
    for idx in permutations(df['jim'].unique()):
        for stop in range(1, 4):
            verify_first_level(df, 'jim', idx[:stop])

    ranked_cases = [
        (['1st', '2nd', '3rd'],
         [2,3,4,0,1,8,9,5,6,7,10,11,12,13,14,18,19,15,16,17]),
        (['3rd', '2nd', '1st'],
         [0,1,2,3,4,10,11,12,5,6,7,8,9,15,16,17,18,19,13,14]),
        (['2nd', '3rd'],
         [0,1,5,6,7,10,11,12,18,19,15,16,17]),
        (['3rd', '1st'],
         [0,1,2,3,4,10,11,12,8,9,15,16,17,13,14]),
    ]
    for labels, indexer in ranked_cases:
        verify(df, 'joe', labels, indexer)

def test_getitem_ix_float_duplicates(self):
df = pd.DataFrame(np.random.randn(3, 3),
index=[0.1, 0.2, 0.2], columns=list('abc'))
Expand Down