Skip to content

ENH: Faster merge_asof() performs a single pass when joining tables (#13902) #13903

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def time_merge_asof_noby(self):
merge_asof(self.df1, self.df2, on='time')


class merge_asof_by(object):
class merge_asof_by_object(object):

def setup(self):
import string
Expand All @@ -326,7 +326,26 @@ def setup(self):
self.df1 = self.df1.sort_values('time')
self.df2 = self.df2.sort_values('time')

def time_merge_asof_by(self):
def time_merge_asof_by_object(self):
merge_asof(self.df1, self.df2, on='time', by='key')


class merge_asof_by_int(object):
    """Benchmark ``merge_asof`` grouped by an integer ``by`` column.

    ``setup`` builds two frames of random data (200,000 and 1,000,000
    rows) that share an integer ``time`` column and an integer ``key``
    column drawn from ``[0, 25)``; the timed method joins them on
    ``time`` within each ``key`` group.
    """

    def setup(self):
        # Fixed seed so every benchmark run operates on identical data.
        np.random.seed(0)
        one_count = 200000
        two_count = 1000000
        # Floor division keeps the randint upper bound an integer on both
        # Python 2 and Python 3 (true division would hand it a float).
        self.df1 = pd.DataFrame(
            {'time': np.random.randint(0, one_count // 20, one_count),
             'key': np.random.randint(0, 25, one_count),
             'value1': np.random.randn(one_count)})
        self.df2 = pd.DataFrame(
            {'time': np.random.randint(0, two_count // 20, two_count),
             'key': np.random.randint(0, 25, two_count),
             'value2': np.random.randn(two_count)})
        # merge_asof requires both inputs to be sorted on the ``on`` key.
        self.df1 = self.df1.sort_values('time')
        self.df2 = self.df2.sort_values('time')

    def time_merge_asof_by_int(self):
        merge_asof(self.df1, self.df2, on='time', by='key')


Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ The following are now part of this API:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A long-time requested feature has been added through the :func:`merge_asof` function, to
support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`). Full documentation is
support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`, :issue:`13902`). Full documentation is
:ref:`here <merging.merge_asof>`.

The :func:`merge_asof` performs an asof merge, which is similar to a left-join
Expand Down
60 changes: 2 additions & 58 deletions pandas/src/join.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ cdef double nan = NaN

from pandas.algos import groupsort_indexer

include "joins_func_helper.pxi"


def inner_join(ndarray[int64_t] left, ndarray[int64_t] right,
Py_ssize_t max_groups):
Expand Down Expand Up @@ -162,64 +164,6 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
return left_indexer, right_indexer


def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right,
                         Py_ssize_t max_groups,  # ignored
                         bint allow_exact_matches=1,
                         left_values=None,
                         right_values=None,
                         tolerance=None):
    """
    Compute indexers for a backward-looking asof left join.

    For every position in ``left`` find the last position in ``right``
    whose value is ``<=`` (or strictly ``<`` when ``allow_exact_matches``
    is false) the left value, scanning ``right`` in one forward pass.

    NOTE(review): the forward-only scan presumably relies on both ``left``
    and ``right`` being sorted ascending -- confirm against callers.

    Parameters
    ----------
    left, right : ndarray[int64_t]
        Join keys for the left and right side.
    max_groups : Py_ssize_t
        Ignored; kept so the signature matches the other join functions.
    allow_exact_matches : bint, default 1
        If true, a right value equal to the left value is a match.
    left_values, right_values : ndarray[int64_t], optional
        Values used for the tolerance check.
    tolerance : int, optional
        Maximum allowed ``left_values[i] - right_values[j]`` for a match.
        The check is applied only when ``left_values``, ``right_values``
        and ``tolerance`` are all given.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64_t]
        Both have ``len(left)`` entries.  ``left_indexer[i] == i`` and
        ``right_indexer[i]`` is the matched position in ``right`` or -1
        when there is no match (or the tolerance is exceeded).
    """

    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        ndarray[int64_t] left_values_, right_values_
        int64_t tolerance_, diff

    # if we are using tolerance, set our typed objects
    if (left_values is not None and right_values is not None and
            tolerance is not None):
        has_tolerance = 1
        left_values_ = left_values
        right_values_ = right_values
        tolerance_ = tolerance

    left_size = len(left)
    right_size = len(right)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance to the first position in right that is past left's value
        if allow_exact_matches:
            while (right_pos < right_size and
                   right[right_pos] <= left[left_pos]):
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right[right_pos] < left[left_pos]):
                right_pos += 1
        # step back onto the last qualifying position (-1 if there is none)
        right_pos -= 1

        # save positions as the desired index
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = right_pos

        # if needed, verify that tolerance is met; read through the typed
        # buffers so the subtraction stays in C instead of Python objects
        if has_tolerance and right_pos != -1:
            diff = left_values_[left_pos] - right_values_[right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer


def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
Py_ssize_t max_groups):
cdef:
Expand Down
Loading