@@ -12,7 +12,7 @@
 from numpy.random import randn
 import numpy as np

-from pandas.compat import lrange, PY35
+from pandas.compat import lrange, PY35, string_types
 from pandas import (compat, isna, notna, DataFrame, Series,
                     MultiIndex, date_range, Timestamp, Categorical,
                     _np_version_under1p12,
@@ -1523,6 +1523,137 @@ def test_isin_empty_datetimelike(self):
     # ----------------------------------------------------------------------
     # Row deduplication

+    @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
+    def test_duplicated_with_misspelled_column_name(self, subset):
+        # GH 19730
+        df = pd.DataFrame({'A': [0, 0, 1],
+                           'B': [0, 0, 1],
+                           'C': [0, 0, 1]})
+
+        with pytest.raises(KeyError):
+            df.duplicated(subset)
+
+        with pytest.raises(KeyError):
+            df.drop_duplicates(subset)
+
+    @pytest.mark.slow
+    def test_duplicated_do_not_fail_on_wide_dataframes(self):
+        # gh-21524
+        # Given the wide dataframe with a lot of columns
+        # with different (important!) values
+        data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
+                for i in range(100)}
+        df = pd.DataFrame(data).T
+        result = df.duplicated()
+
+        # Then duplicates produce the bool pd.Series as a result
+        # and don't fail during calculation.
+        # Actual values doesn't matter here, though usually
+        # it's all False in this case
+        assert isinstance(result, pd.Series)
+        assert result.dtype == np.bool
+
+    @pytest.mark.parametrize('keep, expected', [
+        ('first', Series([False, False, True, False, True])),
+        ('last', Series([True, True, False, False, False])),
+        (False, Series([True, True, True, False, True]))
+    ])
+    def test_duplicated_keep(self, keep, expected):
+        df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
+
+        result = df.duplicated(keep=keep)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('keep, expected', [
+        ('first', Series([False, False, True, True, True])),
+        ('last', Series([True, True, False, True, False])),
+        (False, Series([True] * 5))
+    ])
+    def test_duplicated_nan_none(self, keep, expected):
+        # np.nan and None are considered equal
+        df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)
+
+        result = df.duplicated(keep=keep)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('keep', ['first', 'last', False])
+    @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
+    def test_duplicated_subset(self, subset, keep):
+        df = DataFrame({'A': [0, 1, 1, 2, 0],
+                        'B': ['a', 'b', 'b', 'c', 'a'],
+                        'C': [np.nan, 3, 3, None, np.nan]})
+
+        if subset is None:
+            subset = list(df.columns)
+
+        expected = df[subset].duplicated(keep=keep).rename(name=None)
+        result = df.duplicated(keep=keep, subset=subset)
+        tm.assert_series_equal(result, expected)
+
+    def test_duplicated_inverse(self):
+        # check that return_inverse kwarg does not affect outcome;
+        # index of inverse must be correctly transformed as well
+        idx = [1, 4, 9, 16, 25]
+        df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
+                       index=idx)
+
+        # keep = 'first'
+        expected_isdup = df.duplicated(keep='first')
+        expected_inv = Series([1, 4, 4, 16, 1], index=idx)
+        result_isdup, result_inv = df.duplicated(keep='first',
+                                                 return_inverse=True)
+        tm.assert_series_equal(result_isdup, expected_isdup)
+        tm.assert_series_equal(result_inv, expected_inv)
+
+        # test that inv works (and fits together with expected_isdup)
+        unique = df.loc[~expected_isdup]
+        reconstr = unique.reindex(result_inv).set_index(result_inv.index)
+        tm.assert_frame_equal(reconstr, df)
+
+        # keep = 'last'
+        expected_isdup = df.duplicated(keep='last')
+        expected_inv = Series([25, 9, 9, 16, 25], index=idx)
+        result_isdup, result_inv = df.duplicated(keep='last',
+                                                 return_inverse=True)
+        tm.assert_series_equal(result_isdup, expected_isdup)
+        tm.assert_series_equal(result_inv, expected_inv)
+
+        # test that inv works (and fits together with expected_isdup)
+        unique = df.loc[~expected_isdup]
+        reconstr = unique.reindex(result_inv).set_index(result_inv.index)
+        tm.assert_frame_equal(reconstr, df)
+
+    def test_duplicated_inverse_raises(self):
+        df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
+
+        # keep = False
+        rgx = 'The parameters return_inverse=True and keep=False cannot be.*'
+        with tm.assert_raises_regex(ValueError, rgx):
+            df.duplicated(keep=False, return_inverse=True)
+
+    @pytest.mark.parametrize('keep', ['first', 'last'])
+    @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
+    def test_duplicated_inverse_large(self, subset, keep):
+        # unsorted index important to check 'first'/'last' functionality
+        df = DataFrame(np.random.randint(0, 10, (10000, 3)),
+                       columns=list('ABC')).sample(5000)
+
+        expected_isdup = df.duplicated(keep=keep, subset=subset)
+        result_isdup, inv = df.duplicated(keep=keep, subset=subset,
+                                          return_inverse=True)
+        tm.assert_series_equal(result_isdup, expected_isdup)
+
+        if subset is None:
+            subset = list(df.columns)
+        elif isinstance(subset, string_types):
+            # need to have a DataFrame, not a Series
+            # -> select columns with singleton list, not string
+            subset = [subset]
+
+        unique = df.loc[~expected_isdup, subset]
+        reconstr = unique.reindex(inv.values).set_index(inv.index)
+        tm.assert_frame_equal(reconstr, df[subset])
+
     def test_drop_duplicates(self):
         df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                                 'foo', 'bar', 'bar', 'foo'],
@@ -1618,36 +1749,6 @@ def test_drop_duplicates(self):
         for keep in ['first', 'last', False]:
             assert df.duplicated(keep=keep).sum() == 0

-    @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
-    def test_duplicated_with_misspelled_column_name(self, subset):
-        # GH 19730
-        df = pd.DataFrame({'A': [0, 0, 1],
-                           'B': [0, 0, 1],
-                           'C': [0, 0, 1]})
-
-        with pytest.raises(KeyError):
-            df.duplicated(subset)
-
-        with pytest.raises(KeyError):
-            df.drop_duplicates(subset)
-
-    @pytest.mark.slow
-    def test_duplicated_do_not_fail_on_wide_dataframes(self):
-        # gh-21524
-        # Given the wide dataframe with a lot of columns
-        # with different (important!) values
-        data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
-                for i in range(100)}
-        df = pd.DataFrame(data).T
-        result = df.duplicated()
-
-        # Then duplicates produce the bool pd.Series as a result
-        # and don't fail during calculation.
-        # Actual values doesn't matter here, though usually
-        # it's all False in this case
-        assert isinstance(result, pd.Series)
-        assert result.dtype == np.bool
-
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836
         df = DataFrame([