Skip to content

Commit 6737d3a

Browse files
Nico Cernek authored and Marco Gorelli committed
add failing test to check row order preservation
- correct the imports
- broken commit with a bunch of print statements and comments
- add test for left merge
- swap left and right keys when how == "right"
- correct old test: right-merge row order is now the same as the right df
- clean up spacing and delete temp code
- add whatsnew
- replace .from_records with default constructor
- add GH issue # to tests
- revert commit ed54bec
- change logic to swap left and right if how==right
- clean formatting
- rename vars and add comment for clarity
- combine tests into one
- update whatsnew
- Update doc/source/whatsnew/v1.0.0.rst
  Co-Authored-By: William Ayd <[email protected]>
- add before and after examples
- linting cleanup
- changes requested by jreback
- update docs
1 parent c02302d commit 6737d3a

File tree

3 files changed: +112 -14 lines changed

doc/source/whatsnew/v1.0.0.rst

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -307,6 +307,31 @@ New repr for :class:`~pandas.arrays.IntervalArray`
307307
to ambiguous or undefined behavior. From pandas 1.0, only the very first argument, which
308308
maps labels to their new names along the default axis, is allowed to be passed by position
309309
(:issue:`29136`).
310+
- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`)
311+
:meth:`DataFrame.merge` preserves right frame's row order
312+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
313+
:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`)
314+
315+
.. code-block:: python
316+
317+
left_df = pd.DataFrame({"colors": ["blue", "red"]}, index=pd.Index([0, 1]))
318+
right_df = pd.DataFrame({"hats": ["small", "big"]}, index=pd.Index([1, 0]))
319+
left_df
320+
right_df
321+
322+
*pandas 0.25.x*
323+
324+
.. code-block:: python
325+
left_df.merge(right_df, left_index=True, right_index=True, how="right")
326+
327+
328+
*pandas 1.0.0*
329+
330+
.. code-block:: python
331+
left_df.merge(right_df, left_index=True, right_index=True, how="right")
332+
333+
334+
310335
311336
*pandas 0.25.x*
312337

@@ -1154,8 +1179,13 @@ Reshaping
11541179
- Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`)
11551180
- Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`)
11561181
- Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`)
1182+
<<<<<<< HEAD
11571183
- Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`)
11581184
- Bug in missing `fill_na` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`)
1185+
- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`)
1186+
=======
1187+
>>>>>>> 2b1b67592... changes requested by jreback
1188+
-
11591189

11601190
Sparse
11611191
^^^^^^

pandas/core/reshape/merge.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -568,10 +568,10 @@ def __init__(
568568
indicator: bool = False,
569569
validate=None,
570570
):
571-
_left = _validate_operand(left)
572-
_right = _validate_operand(right)
573-
self.left = self.orig_left = _left
574-
self.right = self.orig_right = _right
571+
left = validate_operand(left)
572+
right = validate_operand(right)
573+
self.left = self.orig_left = left
574+
self.right = self.orig_right = right
575575
self.how = how
576576
self.axis = axis
577577

@@ -1295,6 +1295,9 @@ def _get_join_indexers(
12951295
right_keys
12961296
), "left_key and right_keys must be the same length"
12971297

1298+
# bind `sort` arg. of _factorize_keys
1299+
fkeys = partial(_factorize_keys, sort=sort)
1300+
12981301
# get left & right join labels and num. of levels at each location
12991302
mapped = (
13001303
_factorize_keys(left_keys[n], right_keys[n], sort=sort)
@@ -1309,15 +1312,20 @@ def _get_join_indexers(
13091312
# factorize keys to a dense i8 space
13101313
# `count` is the num. of unique keys
13111314
# set(lkey) | set(rkey) == range(count)
1312-
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
13131315

1316+
# flip left and right keys if performing a right merge
1317+
# to preserve right merge row order (GH 27453)
1318+
if how == "right":
1319+
factorized_rkey, factorized_lkey, count = fkeys(rkey, lkey)
1320+
else:
1321+
factorized_lkey, factorized_rkey, count = fkeys(lkey, rkey)
13141322
# preserve left frame order if how == 'left' and sort == False
13151323
kwargs = copy.copy(kwargs)
13161324
if how == "left":
13171325
kwargs["sort"] = sort
13181326
join_func = _join_functions[how]
13191327

1320-
return join_func(lkey, rkey, count, **kwargs)
1328+
return join_func(factorized_lkey, factorized_rkey, count, **kwargs)
13211329

13221330

13231331
def _restore_dropped_levels_multijoin(

pandas/tests/reshape/merge/test_merge.py

Lines changed: 68 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1288,17 +1288,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
12881288
# GH 24212
12891289
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
12901290
# -1 is interpreted as a missing value instead of the last element
1291-
df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index)
1292-
df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]})
1291+
df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
1292+
df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]})
12931293
result = df1.merge(df2, left_on="key", right_index=True, how=how)
12941294
expected = pd.DataFrame(
12951295
[
1296-
[1.0, 0, 1],
1297-
[2.0, 2, 3],
1298-
[3.0, 2, 3],
1299-
[np.nan, 1, 2],
1300-
[np.nan, 3, 4],
1301-
[np.nan, 4, 5],
1296+
[0, 0, 0],
1297+
[1, 1, 1],
1298+
[2, 2, 2],
1299+
[np.nan, 3, 3],
1300+
[np.nan, 4, 4],
1301+
[np.nan, 5, 5],
13021302
],
13031303
columns=["a", "key", "b"],
13041304
)
@@ -2169,3 +2169,63 @@ def test_merge_datetime_upcast_dtype():
21692169
}
21702170
)
21712171
tm.assert_frame_equal(result, expected)
2172+
2173+
2174+
@pytest.mark.parametrize("how", ["left", "right"])
2175+
def test_merge_preserves_row_order(how):
2176+
# GH 27453
2177+
population = [
2178+
("Jenn", "Jamaica", 3),
2179+
("Beth", "Bulgaria", 7),
2180+
("Carl", "Canada", 30),
2181+
]
2182+
columns = ["name", "country", "population"]
2183+
population_df = DataFrame(population, columns=columns)
2184+
2185+
people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")]
2186+
columns = ["name", "country"]
2187+
people_df = DataFrame(people, columns=columns)
2188+
2189+
expected_data = [
2190+
("Abe", "America", np.nan),
2191+
("Beth", "Bulgaria", 7),
2192+
("Carl", "Canada", 30),
2193+
]
2194+
expected_cols = ["name", "country", "population"]
2195+
expected = DataFrame(expected_data, columns=expected_cols)
2196+
2197+
result = pop.merge(ppl, on=("name", "country"), how="right")
2198+
2199+
tm.assert_frame_equal(result, expected)
2200+
2201+
2202+
def test_left_merge_preserves_row_order():
2203+
# GH 27453
2204+
population = [
2205+
("Jenn", "Jamaica", 3),
2206+
("Beth", "Bulgaria", 7),
2207+
("Carl", "Canada", 30),
2208+
]
2209+
columns = ["name", "country", "population"]
2210+
pop = DataFrame(population, columns=columns)
2211+
2212+
people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")]
2213+
columns = ["name", "country"]
2214+
ppl = DataFrame(people, columns=columns)
2215+
2216+
expected_data = [
2217+
("Abe", "America", np.nan),
2218+
("Beth", "Bulgaria", 7),
2219+
("Carl", "Canada", 30),
2220+
]
2221+
expected_cols = ["name", "country", "population"]
2222+
expected = DataFrame(expected_data, columns=expected_cols)
2223+
2224+
result = ppl.merge(pop, on=("name", "country"), how="left")
2225+
if how == "right":
2226+
left_df, right_df = population_df, people_df
2227+
elif how == "left":
2228+
left_df, right_df = people_df, population_df
2229+
2230+
result = left_df.merge(right_df, on=("name", "country"), how=how)
2231+
tm.assert_frame_equal(expected, result)

0 commit comments

Comments
 (0)