Skip to content

Commit 9940cfb

Browse files
committed
Merge pull request #6388 from jreback/dups_dropna
BUG: Bug in DataFrame.dropna with duplicate indices (GH6355)
2 parents c174c3d + 1d36bd9 commit 9940cfb

File tree

4 files changed: 26 additions (+26) and 2 deletions (-2)

4 files changed: 26 additions (+26) and 2 deletions (-2)

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -126,6 +126,7 @@ Bug Fixes
126126
- Bug in interpolate changing dtypes (:issue:`6290`)
127127
- Bug in Series.get, was using a buggy access method (:issue:`6383`)
128128
- Bug in hdfstore queries of the form ``where=[('date', '>=', datetime(2013,1,1)), ('date', '<=', datetime(2014,1,1))]`` (:issue:`6313`)
129+
- Bug in DataFrame.dropna with duplicate indices (:issue:`6355`)
129130

130131
pandas 0.13.1
131132
-------------

pandas/core/frame.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2414,8 +2414,8 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
24142414

24152415
agg_obj = self
24162416
if subset is not None:
2417-
agg_axis_name = self._get_axis_name(agg_axis)
2418-
agg_obj = self.reindex(**{agg_axis_name: subset})
2417+
ax = self._get_axis(agg_axis)
2418+
agg_obj = self.take(ax.get_indexer_for(subset),axis=agg_axis)
24192419

24202420
count = agg_obj.count(axis=agg_axis)
24212421

pandas/core/index.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1210,6 +1210,12 @@ def get_indexer_non_unique(self, target, **kwargs):
12101210
indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
12111211
return Index(indexer), missing
12121212

1213+
def get_indexer_for(self, target, **kwargs):
1214+
""" guaranteed return of an indexer even when non-unique """
1215+
if self.is_unique:
1216+
return self.get_indexer(target, **kwargs)
1217+
return self.get_indexer_non_unique(target, **kwargs)[0]
1218+
12131219
def _possibly_promote(self, other):
12141220
# A hack, but it works
12151221
from pandas.tseries.index import DatetimeIndex

pandas/tests/test_frame.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3256,6 +3256,23 @@ def test_column_dups2(self):
32563256
result = df2.drop('C',axis=1)
32573257
assert_frame_equal(result, expected)
32583258

3259+
# dropna
3260+
df = DataFrame({'A' : np.random.randn(5),
3261+
'B' : np.random.randn(5),
3262+
'C' : np.random.randn(5),
3263+
'D' : ['a','b','c','d','e'] })
3264+
df.iloc[2,[0,1,2]] = np.nan
3265+
df.iloc[0,0] = np.nan
3266+
df.iloc[1,1] = np.nan
3267+
df.iloc[:,3] = np.nan
3268+
expected = df.dropna(subset=['A','B','C'],how='all')
3269+
expected.columns = ['A','A','B','C']
3270+
3271+
df.columns = ['A','A','B','C']
3272+
3273+
result = df.dropna(subset=['A','C'],how='all')
3274+
assert_frame_equal(result, expected)
3275+
32593276
def test_column_dups_indexing(self):
32603277

32613278
def check(result, expected=None):

0 commit comments

Comments
 (0)