Skip to content

BUG/CLN: Allow the BlockManager to have a non-unique items (axis 0) #3509

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 2, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,20 @@ pandas 0.11.1
- Fix regression in a DataFrame apply with axis=1, objects were not being converted back
to base dtypes correctly (GH3480_)
- Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
- Non-unique index support clarified (GH3468_)

- Fix assigning a new index to a duplicate index in a DataFrame would fail (GH3468_)
- Fix construction of a DataFrame with a duplicate index
- ref_locs support to allow duplicate indices across dtypes,
  allowing iget to always find the index (even across dtypes) (GH2194_)
- applymap on a DataFrame with a non-unique index now works
(removed warning) (GH2786_), and fix (GH3230_)
- Fix to_csv to handle non-unique columns (GH3495_)

.. _GH3164: https://github.com/pydata/pandas/issues/3164
.. _GH2786: https://github.com/pydata/pandas/issues/2786
.. _GH2194: https://github.com/pydata/pandas/issues/2194
.. _GH3230: https://github.com/pydata/pandas/issues/3230
.. _GH3251: https://github.com/pydata/pandas/issues/3251
.. _GH3379: https://github.com/pydata/pandas/issues/3379
.. _GH3480: https://github.com/pydata/pandas/issues/3480
Expand All @@ -75,8 +87,10 @@ pandas 0.11.1
.. _GH3455: https://github.com/pydata/pandas/issues/3455
.. _GH3457: https://github.com/pydata/pandas/issues/3457
.. _GH3461: https://github.com/pydata/pandas/issues/3461
.. _GH3468: https://github.com/pydata/pandas/issues/3468
.. _GH3448: https://github.com/pydata/pandas/issues/3448
.. _GH3449: https://github.com/pydata/pandas/issues/3449
.. _GH3495: https://github.com/pydata/pandas/issues/3495
.. _GH3493: https://github.com/pydata/pandas/issues/3493


Expand Down
1 change: 1 addition & 0 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,7 @@ def _default_index(n):
values = np.arange(n, dtype=np.int64)
result = values.view(Int64Index)
result.name = None
result.is_unique = True
return result


Expand Down
35 changes: 8 additions & 27 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,21 +820,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
self.blocks = self.obj._data.blocks
ncols = sum(len(b.items) for b in self.blocks)
self.data =[None] * ncols

if self.obj.columns.is_unique:
self.colname_map = dict((k,i) for i,k in enumerate(self.obj.columns))
else:
ks = [set(x.items) for x in self.blocks]
u = len(reduce(lambda a,x: a.union(x),ks,set()))
t = sum(map(len,ks))
if u != t:
if len(set(self.cols)) != len(self.cols):
raise NotImplementedError("duplicate columns with differing dtypes are unsupported")
else:
# if columns are not unique and we acces this,
# we're doing it wrong
pass

self.column_map = self.obj._data.get_items_map()

if chunksize is None:
chunksize = (100000/ (len(self.cols) or 1)) or 1
Expand Down Expand Up @@ -1034,18 +1020,13 @@ def _save_chunk(self, start_i, end_i):

# create the data for a chunk
slicer = slice(start_i,end_i)
if self.obj.columns.is_unique:
for i in range(len(self.blocks)):
b = self.blocks[i]
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
for j, k in enumerate(b.items):
# self.data is a preallocated list
self.data[self.colname_map[k]] = d[j]
else:
# self.obj should contain a proper view of the dataframes
# with the specified ordering of cols if cols was specified
for i in range(len(self.obj.columns)):
self.data[i] = self.obj.icol(i).values[slicer].tolist()
for i in range(len(self.blocks)):
b = self.blocks[i]
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
for i, item in enumerate(b.items):

# self.data is a preallocated list
self.data[self.column_map[b][i]] = d[i]

ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)

Expand Down
3 changes: 0 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4261,9 +4261,6 @@ def infer(x):
if com.is_datetime64_dtype(x):
x = lib.map_infer(x, lib.Timestamp)
return lib.map_infer(x, func)
#GH2786
if not self.columns.is_unique:
raise ValueError("applymap does not support dataframes having duplicate column labels")
return self.apply(infer)

#----------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def is_monotonic(self):
def is_lexsorted_for_tuple(self, tup):
    """Return whether the index is lexically sorted for the given tuple *tup*.

    The base Index is flat, so this unconditionally returns True;
    presumably subclasses with multiple levels override it with a
    real check — TODO confirm against the subclass implementations.
    """
    return True

@cache_readonly
@cache_readonly(allow_setting=True)
def is_unique(self):
return self._engine.is_unique

Expand Down
Loading