Skip to content

Commit cff1f55

Browse files
Christopher C. Aycock, jreback
Christopher C. Aycock
authored and jreback committed
ENH: Faster merge_asof() performs a single pass when joining tables (#13902)
This version passes existing regression tests but is ultimately wrong because it requires the "by" column to be a single object. A proper version would handle int (and possibly float) columns through type differentiation. Author: Christopher C. Aycock <[email protected]> Closes #13903 from chrisaycock/master and squashes the following commits: f0d0165 [Christopher C. Aycock] ENH: Faster merge_asof() performs a single pass when joining tables (#13902)
1 parent 7e15923 commit cff1f55

File tree

8 files changed

+755
-135
lines changed

8 files changed

+755
-135
lines changed

asv_bench/benchmarks/join_merge.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ def time_merge_asof_noby(self):
310310
merge_asof(self.df1, self.df2, on='time')
311311

312312

313-
class merge_asof_by(object):
313+
class merge_asof_by_object(object):
314314

315315
def setup(self):
316316
import string
@@ -326,7 +326,26 @@ def setup(self):
326326
self.df1 = self.df1.sort_values('time')
327327
self.df2 = self.df2.sort_values('time')
328328

329-
def time_merge_asof_by(self):
329+
def time_merge_asof_by_object(self):
330+
merge_asof(self.df1, self.df2, on='time', by='key')
331+
332+
333+
class merge_asof_by_int(object):
334+
335+
def setup(self):
336+
np.random.seed(0)
337+
one_count = 200000
338+
two_count = 1000000
339+
self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count),
340+
'key': np.random.randint(0, 25, one_count),
341+
'value1': np.random.randn(one_count)})
342+
self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count),
343+
'key': np.random.randint(0, 25, two_count),
344+
'value2': np.random.randn(two_count)})
345+
self.df1 = self.df1.sort_values('time')
346+
self.df2 = self.df2.sort_values('time')
347+
348+
def time_merge_asof_by_int(self):
330349
merge_asof(self.df1, self.df2, on='time', by='key')
331350

332351

doc/source/whatsnew/v0.19.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ The following are now part of this API:
4848
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4949

5050
A long-time requested feature has been added through the :func:`merge_asof` function, to
51-
support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`). Full documentation is
51+
support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`, :issue:`13902`). Full documentation is
5252
:ref:`here <merging.merge_asof>`
5353

5454
The :func:`merge_asof` performs an asof merge, which is similar to a left-join

pandas/src/join.pyx

Lines changed: 2 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ cdef double nan = NaN
3434

3535
from pandas.algos import groupsort_indexer
3636

37+
include "joins_func_helper.pxi"
38+
3739

3840
def inner_join(ndarray[int64_t] left, ndarray[int64_t] right,
3941
Py_ssize_t max_groups):
@@ -162,64 +164,6 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
162164
return left_indexer, right_indexer
163165

164166

165-
def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right,
166-
Py_ssize_t max_groups, # ignored
167-
bint allow_exact_matches=1,
168-
left_values=None,
169-
right_values=None,
170-
tolerance=None):
171-
172-
cdef:
173-
Py_ssize_t left_pos, right_pos, left_size, right_size
174-
ndarray[int64_t] left_indexer, right_indexer
175-
bint has_tolerance = 0
176-
ndarray[int64_t] left_values_, right_values_
177-
int64_t tolerance_
178-
179-
# if we are using tolerance, set our objects
180-
if (left_values is not None and right_values is not None and
181-
tolerance is not None):
182-
has_tolerance = 1
183-
left_values_ = left_values
184-
right_values_ = right_values
185-
tolerance_ = tolerance
186-
187-
left_size = len(left)
188-
right_size = len(right)
189-
190-
left_indexer = np.empty(left_size, dtype=np.int64)
191-
right_indexer = np.empty(left_size, dtype=np.int64)
192-
193-
right_pos = 0
194-
for left_pos in range(left_size):
195-
# restart right_pos if it went negative in a previous iteration
196-
if right_pos < 0:
197-
right_pos = 0
198-
199-
# find last position in right whose value is less than left's value
200-
if allow_exact_matches:
201-
while (right_pos < right_size and
202-
right[right_pos] <= left[left_pos]):
203-
right_pos += 1
204-
else:
205-
while (right_pos < right_size and
206-
right[right_pos] < left[left_pos]):
207-
right_pos += 1
208-
right_pos -= 1
209-
210-
# save positions as the desired index
211-
left_indexer[left_pos] = left_pos
212-
right_indexer[left_pos] = right_pos
213-
214-
# if needed, verify that tolerance is met
215-
if has_tolerance and right_pos != -1:
216-
diff = left_values[left_pos] - right_values[right_pos]
217-
if diff > tolerance_:
218-
right_indexer[left_pos] = -1
219-
220-
return left_indexer, right_indexer
221-
222-
223167
def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
224168
Py_ssize_t max_groups):
225169
cdef:

0 commit comments

Comments
 (0)