diff --git a/RELEASE.rst b/RELEASE.rst
index f3fb98535cb61..38298fde12ff0 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -61,8 +61,21 @@ pandas 0.11.1
   - Fix regression in a DataFrame apply with axis=1, objects were not being
    converted back to base dtypes correctly (GH3480_)
   - Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
+  - Fix failure when assigning a new index to a duplicate index in a DataFrame (GH3468_)
+  - ref_locs support to allow duplicative indices across dtypes (GH3468_)
+  - Non-unique index support clarified (GH3468_)
+
+    - Fix failure when assigning a new index to a duplicate index in a DataFrame
+    - Fix construction of a DataFrame with a duplicate index
+    - ref_locs support to allow duplicative indices across dtypes
+      (GH2194_)
+    - applymap on a DataFrame with a non-unique index now works
+      (removed warning) (GH2786_), and fix (GH3230_)

 .. _GH3164: https://github.com/pydata/pandas/issues/3164
+.. _GH2786: https://github.com/pydata/pandas/issues/2786
+.. _GH2194: https://github.com/pydata/pandas/issues/2194
+.. _GH3230: https://github.com/pydata/pandas/issues/3230
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
 .. _GH3379: https://github.com/pydata/pandas/issues/3379
 .. _GH3480: https://github.com/pydata/pandas/issues/3480
@@ -75,6 +88,7 @@ pandas 0.11.1
 .. _GH3455: https://github.com/pydata/pandas/issues/3455
 .. _GH3457: https://github.com/pydata/pandas/issues/3457
 .. _GH3461: https://github.com/pydata/pandas/issues/3461
+.. _GH3468: https://github.com/pydata/pandas/issues/3468
 .. _GH3448: https://github.com/pydata/pandas/issues/3448
 .. _GH3449: https://github.com/pydata/pandas/issues/3449
 .. _GH3493: https://github.com/pydata/pandas/issues/3493
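
Note: the frame.py hunk below removes the GH2786 guard, so ``applymap`` no longer
raises on duplicate column labels. A minimal sketch of the behavior this enables
(illustrative only, not part of the patch):

    import numpy as np
    from pandas import DataFrame

    df = DataFrame(np.random.random((3, 4)))
    df.columns = ['a', 'a', 'a', 'a']   # duplicate labels are now allowed

    # before this patch: ValueError("applymap does not support dataframes
    # having duplicate column labels"); after: func is applied elementwise
    result = df.applymap(str)
    assert (result.dtypes == object).all()
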
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 2cb7608c7aba6..8bfdee3b75170 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4261,9 +4261,6 @@ def infer(x):
             if com.is_datetime64_dtype(x):
                 x = lib.map_infer(x, lib.Timestamp)
             return lib.map_infer(x, func)
-        #GH2786
-        if not self.columns.is_unique:
-            raise ValueError("applymap does not support dataframes having duplicate column labels")
         return self.apply(infer)

     #----------------------------------------------------------------------
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 03cfd18f5afe5..c874b061dd63d 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -61,6 +61,7 @@ def ref_locs(self):
             if (indexer == -1).any():
                 raise AssertionError('Some block items were not in block '
                                      'ref_items')
+            self._ref_locs = indexer
         return self._ref_locs

@@ -164,6 +165,9 @@ def get(self, item):
         loc = self.items.get_loc(item)
         return self.values[loc]

+    def iget(self, i):
+        return self.values[i]
+
     def set(self, item, value):
         """ Modify Block in-place with new item value

@@ -710,7 +714,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True):
         # attempt to create new type blocks
         blocks = []
         for i, c in enumerate(self.items):
-            values = self.get(c)
+            values = self.iget(i)
             values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric)
             values = _block_shape(values)

@@ -879,7 +883,7 @@ class BlockManager(object):
     -----
     This is *not* a public API class
     """
-    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated']
+    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs']

     def __init__(self, blocks, axes, do_integrity_check=True):
         self.axes = [_ensure_index(ax) for ax in axes]

@@ -915,12 +919,82 @@ def set_axis(self, axis, value):
         if len(value) != len(cur_axis):
             raise Exception('Length mismatch (%d vs %d)'
                             % (len(value), len(cur_axis)))
+        self.axes[axis] = value
         if axis == 0:
+
+            # we have a non-unique index, so set up the ref_locs
+            if not cur_axis.is_unique:
+                self.set_ref_locs(cur_axis)
+
+            # take via ref_locs
             for block in self.blocks:
                 block.set_ref_items(self.items, maybe_rename=True)

+    def set_ref_locs(self, labels = None):
+        # if we have a non-unique index on this axis, set the indexers
+        # we need to set an absolute indexer for the blocks
+        # return the indexer if we are not unique
+        if labels is None:
+            labels = self.items
+
+        if labels.is_unique:
+            return None
+
+        #### THIS IS POTENTIALLY VERY SLOW #####
+
+        # if we are already computed, then we are done
+        rl = getattr(self,'_ref_locs',None)
+        if rl is not None:
+            return rl
+
+        blocks = self.blocks
+
+        # initialize
+        blockmap = dict()
+        for b in blocks:
+            arr = np.empty(len(b.items),dtype='int64')
+            arr.fill(-1)
+            b._ref_locs = arr
+
+            # add this block to the blockmap for each
+            # of the items in the block
+            for item in b.items:
+                if item not in blockmap:
+                    blockmap[item] = []
+                blockmap[item].append(b)
+
+        rl = np.empty(len(labels),dtype=object)
+        for i, item in enumerate(labels.values):
+
+            try:
+                block = blockmap[item].pop(0)
+            except:
+                raise Exception("not enough items in set_ref_locs")
+
+            indexer = np.arange(len(block.items))
+            mask = (block.items == item) & (block._ref_locs == -1)
+            if not mask.any():
+
+                # this case will catch a comparison of an index of tuples
+                mask = np.empty(len(block.items),dtype=bool)
+                mask.fill(False)
+                for j, (bitem, brl) in enumerate(zip(block.items,block._ref_locs)):
+                    mask[j] = bitem == item and brl == -1
+
+            indices = indexer[mask]
+            if len(indices):
+                idx = indices[0]
+            else:
+                raise Exception("already set too many items in set_ref_locs")
+
+            block._ref_locs[idx] = i
+            rl[i] = (block,idx)
+
+        self._ref_locs = rl
+        return rl
+
     # make items read only for now
     def _get_items(self):
         return self.axes[0]
@@ -1387,26 +1461,11 @@ def iget(self, i):
         item = self.items[i]
         if self.items.is_unique:
             return self.get(item)
-        else:
-            # ugh
-            try:
-                inds, = (self.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                inds, = self.items.map(lambda x: x == item).nonzero()
-
-            _, block = self._find_block(item)
-
-            try:
-                binds, = (block.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                binds, = block.items.map(lambda x: x == item).nonzero()
-
-            for j, (k, b) in enumerate(zip(inds, binds)):
-                if i == k:
-                    return block.values[b]
-
-            raise Exception('Cannot have duplicate column names '
-                            'split across dtypes')
+
+        # compute the duplicative indexer if needed
+        ref_locs = self.set_ref_locs()
+        b, loc = ref_locs[i]
+        return b.values[loc]

     def get_scalar(self, tup):
         """
@@ -1582,6 +1641,8 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
         # keep track of what items aren't found anywhere
         mask = np.zeros(len(item_order), dtype=bool)

+        new_axes = [new_items] + self.axes[1:]
+
         new_blocks = []
         for blk in self.blocks:
             blk_indexer = blk.items.get_indexer(item_order)
@@ -1605,7 +1666,7 @@
             new_blocks.append(na_block)
             new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def reindex_items(self, new_items, copy=True, fill_value=np.nan):
         """
@@ -1619,6 +1680,7 @@
         # TODO: this part could be faster (!)
         new_items, indexer = self.items.reindex(new_items)
+        new_axes = [new_items] + self.axes[1:]

         # could have some pathological (MultiIndex) issues here
         new_blocks = []
@@ -1643,7 +1705,7 @@
             new_blocks.append(na_block)
             new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def _make_na_block(self, items, ref_items, fill_value=np.nan):
         # TODO: infer dtypes other than float64 from fill_value
@@ -1685,11 +1747,11 @@ def merge(self, other, lsuffix=None, rsuffix=None):
         this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

         cons_items = this.items + other.items
-        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
-
         new_axes = list(this.axes)
         new_axes[0] = cons_items

+        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
+
         return BlockManager(consolidated, new_axes)

     def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
@@ -1902,7 +1964,6 @@ def form_blocks(arrays, names, axes):
         na_block = make_block(block_values, extra_items, items)
         blocks.append(na_block)
-        blocks = _consolidate(blocks, items)

     return blocks
@@ -1953,9 +2014,6 @@ def _shape_compat(x):
     names, arrays = zip(*tuples)

-    # index may box values
-    items = ref_items[ref_items.isin(names)]
-
     first = arrays[0]
     shape = (len(arrays),) + _shape_compat(first)
@@ -1963,6 +2021,14 @@
     for i, arr in enumerate(arrays):
         stacked[i] = _asarray_compat(arr)

+    # index may box values
+    if ref_items.is_unique:
+        items = ref_items[ref_items.isin(names)]
+    else:
+        items = _ensure_index([ n for n in names if n in ref_items ])
+        if len(items) != len(stacked):
+            raise Exception("invalid names passed _stack_arrays")
+
     return items, stacked
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 7bafed216b9b9..cb3799c28d0cf 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -7492,12 +7492,15 @@ def test_applymap(self):
         self.assert_(result.dtypes[0] == object)

         # GH2786
-        df = DataFrame(np.random.random((3,4)))
-        df.columns = ['a','a','a','a']
-        try:
-            df.applymap(str)
-        except ValueError as e:
-            self.assertTrue("support" in str(e))
+        df = DataFrame(np.random.random((3,4)))
+        df2 = df.copy()
+        cols = ['a','a','a','a']
+        df.columns = cols
+
+        expected = df2.applymap(str)
+        expected.columns = cols
+        result = df.applymap(str)
+        assert_frame_equal(result,expected)

     def test_filter(self):
         # items
@@ -9201,6 +9204,62 @@ def test_assign_columns(self):
         assert_series_equal(self.frame['C'], frame['baz'])
         assert_series_equal(self.frame['hi'], frame['foo2'])

+    def test_columns_with_dups(self):
+
+        # GH 3468 related
+
+        # basic
+        df = DataFrame([[1,2]], columns=['a','a'])
+        df.columns = ['a','a.1']
+        str(df)
+        expected = DataFrame([[1,2]], columns=['a','a.1'])
+        assert_frame_equal(df, expected)
+
+        df = DataFrame([[1,2,3]], columns=['b','a','a'])
+        df.columns = ['b','a','a.1']
+        str(df)
+        expected = DataFrame([[1,2,3]], columns=['b','a','a.1'])
+        assert_frame_equal(df, expected)
+
+        # with a dup index
+        df = DataFrame([[1,2]], columns=['a','a'])
+        df.columns = ['b','b']
+        str(df)
+        expected = DataFrame([[1,2]], columns=['b','b'])
+        assert_frame_equal(df, expected)
+
+        # multi-dtype
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=['a','a','b','b','d','c','c'])
+        df.columns = list('ABCDEFG')
+        str(df)
+        expected = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('ABCDEFG'))
+        assert_frame_equal(df, expected)
+
+        # this is an error because we cannot disambiguate the dup columns
+        self.assertRaises(Exception, lambda x: DataFrame([[1,2,'foo','bar']], columns=['a','a','a','a']))
+
+        # dups across blocks
+        df_float = DataFrame(np.random.randn(10, 3),dtype='float64')
+        df_int = DataFrame(np.random.randn(10, 3),dtype='int64')
+        df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns)
+        df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns)
+        df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns)
+        df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1)
+
+        result = df._data.set_ref_locs()
+        self.assert_(len(result) == len(df.columns))
+
+        # testing iget
+        for i in range(len(df.columns)):
+            df.iloc[:,i]
+
+        # dup columns across dtype GH 2079/2194
+        vals = [[1, -1, 2.], [2, -2, 3.]]
+        rs = DataFrame(vals, columns=['A', 'A', 'B'])
+        xp = DataFrame(vals)
+        xp.columns = ['A', 'A', 'B']
+        assert_frame_equal(rs, xp)
+
     def test_cast_internals(self):
         casted = DataFrame(self.frame._data, dtype=int)
         expected = DataFrame(self.frame._series, dtype=int)
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 86cd0ef524b35..8e1ea569973a6 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -772,6 +772,13 @@ def test_dups_fancy_indexing(self):
         expected = Index(['b','a','a'])
         self.assert_(result.equals(expected))

+        # across dtypes
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa'))
+        result = DataFrame([[1,2,1.,2.,3.,'foo','bar']])
+        result.columns = list('aaaaaaa')
+
+        assert_frame_equal(df,result)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index eec5f5632d36b..e25bd0de769a7 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -268,7 +268,7 @@ def test_duplicate_item_failure(self):
             b.ref_items = items

         mgr = BlockManager(blocks, [items, np.arange(N)])
-        self.assertRaises(Exception, mgr.iget, 1)
+        mgr.iget(1)

     def test_contains(self):
         self.assert_('a' in self.mgr)
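
Note: for context, a sketch of the duplicate-column machinery these tests exercise.
``set_ref_locs`` records one ``(block, offset)`` pair per column, so positional access
(``iget``/``iloc``) resolves correctly even when duplicate labels span several dtype
blocks. ``_data`` and ``set_ref_locs`` are internal, non-public APIs; the example is
illustrative only, not part of the patch:

    import numpy as np
    import pandas as pd
    from pandas import DataFrame

    df_float = DataFrame(np.random.randn(10, 3), dtype='float64')
    df_int = DataFrame(np.random.randn(10, 3), dtype='int64')
    df_obj = DataFrame('foo', index=df_float.index, columns=df_float.columns)

    # column labels 0, 1, 2 are each duplicated across three dtype blocks
    df = pd.concat([df_float, df_int, df_obj], axis=1)

    # internal: one (block, offset) pair per column, in column order
    ref_locs = df._data.set_ref_locs()
    assert len(ref_locs) == len(df.columns)

    # positional access goes through ref_locs rather than label lookup
    for i in range(len(df.columns)):
        df.iloc[:, i]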