Skip to content

[WIP] Test (and more fixes) for duplicate indices with concat #38745

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,20 @@ def _get_combined_index(
indexes = _get_distinct_objs(indexes)
if len(indexes) == 0:
index = Index([])
elif len(indexes) == 1:
elif len(indexes) == 1 or all_indexes_same(indexes):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add some comments here as it is non-obvious what is happening

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about this?

Suggested change
elif len(indexes) == 1 or all_indexes_same(indexes):
# if unique by id or unique by value
elif len(indexes) == 1 or all_indexes_same(indexes):

index = indexes[0]
elif intersect:
duplicates = union_indexes(
[index[index.duplicated(keep="first")] for index in indexes]
)
index = indexes[0]
for other in indexes[1:]:
index = index.intersection(other)
if len(duplicates.intersection(index)) > 0:
raise InvalidIndexError("Duplicated values in intersection of indices.")
else:
if not all(idx.is_unique for idx in indexes):
raise InvalidIndexError("Cannot union indices with duplicate values.")
index = union_indexes(indexes, sort=sort)
index = ensure_index(index)

Expand Down
41 changes: 41 additions & 0 deletions pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import numpy as np
import pytest

from pandas.errors import InvalidIndexError

from pandas.core.dtypes.common import is_dtype_equal

import pandas as pd
Expand All @@ -21,6 +23,7 @@
)
import pandas._testing as tm
from pandas.api.types import is_datetime64tz_dtype, pandas_dtype
from pandas.core.indexes.api import get_objs_combined_axis

COMPATIBLE_INCONSISTENT_PAIRS = {
(Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex),
Expand Down Expand Up @@ -463,3 +466,41 @@ def test_setop_with_categorical(index, sort, method):
result = getattr(index, method)(other[:5], sort=sort)
expected = getattr(index, method)(index[:5], sort=sort)
tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("reverse", [True, False])
def test_valid_intersection_w_dupes(index, reverse):
    """Intersect two axes where the duplicated label is absent from one side.

    One Series carries labels ``[0, 0, 1, 2]`` (by position in the unique
    fixture index) and the other carries ``[1, 2]``.  The test expects
    ``get_objs_combined_axis(..., intersect=True)`` to return the labels at
    positions 1 and 2, in any order, regardless of which Series comes first.
    """
    labels = index.unique()
    if len(labels) < 3:
        # Not enough distinct labels in this fixture to build the scenario.
        pytest.skip()

    with_dupes = pd.Series(1, index=labels[[0, 0, 1, 2]])
    without_dupes = pd.Series(0, index=labels[[1, 2]])
    pair = [with_dupes, without_dupes]
    objs = reversed(pair) if reverse else pair

    combined = get_objs_combined_axis(objs, intersect=True)

    tm.assert_index_equal(combined, labels[[1, 2]], check_order=False)


@pytest.mark.parametrize("reverse", [True, False])
def test_invalid_intersection_w_dupes(index, reverse):
    """Intersect two axes where the duplicated label is present on both sides.

    One Series carries labels ``[0, 0, 1, 2]`` (by position in the unique
    fixture index) and the other carries ``[0, 2]``, so the duplicated label
    is part of the intersection.  The test expects
    ``get_objs_combined_axis(..., intersect=True)`` to raise
    ``InvalidIndexError`` regardless of which Series comes first.
    """
    labels = index.unique()
    if len(labels) < 3:
        # Not enough distinct labels in this fixture to build the scenario.
        pytest.skip()

    pair = [
        pd.Series(1, index=labels[[0, 0, 1, 2]]),
        pd.Series(0, index=labels[[0, 2]]),
    ]
    objs = reversed(pair) if reverse else pair

    with pytest.raises(InvalidIndexError):
        get_objs_combined_axis(objs, intersect=True)
53 changes: 53 additions & 0 deletions pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import numpy as np
import pytest

from pandas.errors import InvalidIndexError

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range
import pandas._testing as tm
Expand Down Expand Up @@ -445,6 +447,57 @@ def test_concat_ordered_dict(self):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("join", ["inner", "outer"])
def test_concat_duplicates_error(index, join):
    """Concatenating frames must fail when a shared column label is duplicated.

    Builds two single-row frames of ones: one whose columns are four unique
    labels, and one whose columns repeat the first of those labels.  The test
    expects ``pd.concat`` to raise ``InvalidIndexError`` for both ``join``
    modes.
    """
    # https://github.com/pandas-dev/pandas/issues/6963
    distinct = index.unique()
    if len(distinct) < 4:
        # The scenario needs four unique labels.
        pytest.skip()

    unique_cols = distinct[:4]
    dup_cols = unique_cols[[0, 0, 1, 2, 3]]

    def _row_of_ones(columns):
        # One-row frame of ones, one column per label in ``columns``.
        return DataFrame(np.ones((1, len(columns))), columns=columns)

    frames = [_row_of_ones(dup_cols), _row_of_ones(unique_cols)]

    with pytest.raises(InvalidIndexError):
        pd.concat(frames, join=join)


@pytest.mark.xfail(reason="Not implemented")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there an issue for this? what is this case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was expecting this to be allowed:

import pandas as pd

result = pd.concat(
    [
        pd.Series(0, index=[0, 0, 1, 2]),
        pd.Series(1, index=[1, 2]),
    ],
    join="inner",
)
expected = pd.DataFrame({0: [0, 0], 1: [1, 1]}, index=[1, 2])

pd.testing.assert_frame_equal(result, expected)

I expected this to work because the intersection of those indices is well defined. However, it turns out that it does not work, and it does not work in pandas 1.1.5 either. I opened a related issue, #38773, although that one is about a more low-level problem.

def test_concat_intersection_duplicates(index):
    """Inner-join concat where the duplicated column is outside the intersection.

    ``df_unique`` has columns at positions ``[0, 1, 2]`` of the unique fixture
    index and ``df_non_unique`` has columns at positions ``[1, 2, 3, 3]``.  The
    duplicated label (position 3) is not shared, so the test expects an inner
    join to keep only the columns at positions 1 and 2.

    The expected frame is built from float literals because both inputs are
    created from ``np.ones`` / ``np.zeros`` without an explicit dtype;
    ``tm.assert_frame_equal`` would otherwise be comparing an integer frame
    against a floating-point result.
    """
    # Failing: https://github.com/pandas-dev/pandas/pull/38745/files#r549577521
    # Concat is valid if the intersection does not contain duplicates
    # Needs an index with 4 unique values
    index = index.unique()
    if len(index) < 4:
        pytest.skip()

    index_unique = index[[0, 1, 2]]
    index_non_unique = index[[1, 2, 3, 3]]

    df_unique = DataFrame(
        np.ones((1, len(index_unique))),
        columns=index_unique,
    )
    df_non_unique = DataFrame(
        np.zeros((1, len(index_non_unique))),
        columns=index_non_unique,
    )

    result = pd.concat([df_unique, df_non_unique], join="inner")
    # Each input frame has a single row labelled 0, hence the repeated row
    # label; values are floats to match the dtype of the inputs.
    expected = DataFrame(
        [[1.0, 1.0], [0.0, 0.0]],
        columns=index[[1, 2]],
        index=[0, 0],
    )

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("pdt", [Series, pd.DataFrame])
@pytest.mark.parametrize("dt", np.sctypes["float"])
def test_concat_no_unnecessary_upcast(dt, pdt):
Expand Down