Commit c7ee203

ENH: merge/join functions, compress the group index when the possible number of groups is insanely large, speed enhancements, refactoring
1 parent e4e9e94 commit c7ee203
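
Note on the "compress the group index" part of this change: when several join keys are combined, their per-key labels are folded into a single integer group index, and the space of possible ids can dwarf the number of groups actually observed. Below is a minimal NumPy sketch of that idea; the helper name is illustrative only, not a function added by this commit (the diff itself does the equivalent with the new Int64HashTable.get_labels / Int64Factorizer further down).

import numpy as np

def compress_group_index(comp_ids):
    # comp_ids: combined key labels (e.g. labels1 * nlevels2 + labels2).
    # Only the observed combinations matter, so re-label them densely as
    # 0..ngroups-1 instead of allocating for the full product space.
    observed, dense = np.unique(comp_ids, return_inverse=True)
    return dense.astype(np.int32), len(observed)

# 10,000 levels per key -> 10**8 possible combinations, but at most
# 100,000 observed groups for 100,000 rows.
labels1 = np.random.randint(0, 10000, size=100000).astype(np.int64)
labels2 = np.random.randint(0, 10000, size=100000).astype(np.int64)
group_index, ngroups = compress_group_index(labels1 * 10000 + labels2)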

File tree: 11 files changed (+278 / -78 lines)


TODO.rst

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-DONE
+meDONE
 ----
 - SparseSeries name integration + tests
 - Refactor Series.repr

bench/bench_merge.R

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+N <- 10000
+indices = rep(NA, N)
+for (i in 1:N)
+  indices[i] <- paste(sample(letters, 10), collapse="")
+
+left <- data.frame(key=rep(indices, 10),
+                   key2=sample(rep(indices, 10)),
+                   value=rnorm(100000))
+right <- data.frame(key=indices,
+                    key2=sample(indices),
+                    value2=rnorm(10000))

bench/bench_merge.py

Lines changed: 17 additions & 0 deletions
@@ -1,4 +1,5 @@
 from pandas import *
+from pandas.util.testing import rands
 import random

 N = 10000
@@ -30,3 +31,19 @@ def get_test_data(ngroups=100, n=N):
 reload(merge)

 result = merge.merge(df, df2, on='key2')
+
+from pandas.util.testing import rands
+N = 10000
+indices = np.array([rands(10) for _ in xrange(N)], dtype='O')
+
+key = np.tile(indices, 10)
+key2 = key.copy()
+random.shuffle(key2)
+indices2 = indices.copy()
+random.shuffle(indices2)
+
+
+left = DataFrame({'key' : key, 'key2': key2,
+                  'value' : np.random.randn(100000)})
+right = DataFrame({'key': indices, 'key2': indices2,
+                   'value2' : np.random.randn(10000)})
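
The benchmark additions above only construct the frames. A hedged sketch of how they might be timed, reusing the merge module and the merge.merge(df, df2, on='key2') call pattern already present in this file; the timing harness itself is not part of this diff, and it assumes the left/right frames and the merge import defined earlier in the script.

import time

def time_single_key_join(niter=5):
    # Sketch only: relies on the `merge` module and the `left`/`right`
    # frames built earlier in this benchmark script.
    best = float('inf')
    for _ in range(niter):
        t0 = time.time()
        merge.merge(left, right, on='key')
        best = min(best, time.time() - t0)
    return best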

pandas/core/generic.py

Lines changed: 0 additions & 17 deletions
@@ -432,23 +432,6 @@ def cumprod(self, axis=None, skipna=True):
         result = y.cumprod(axis)
         return self._wrap_array(result, self.axes, copy=False)

-    def _values_aggregate(self, func, axis, fill_value, skipna=True):
-        axis = self._get_axis_number(axis)
-
-        values = self.values
-        mask = np.isfinite(values)
-
-        if skipna and fill_value is not None:
-            values = values.copy()
-            values[-mask] = fill_value
-
-        result = func(values, axis=axis)
-        count = mask.sum(axis=axis)
-
-        result[count == 0] = np.NaN
-
-        return result
-
     def copy(self, deep=True):
         """
         Make a copy of this object

pandas/src/hashtable.pyx

Lines changed: 153 additions & 10 deletions
@@ -239,6 +239,113 @@ cdef class StringHashTable:
         # return None
         return reverse, labels, counts[:count].copy()

+cdef class Int32HashTable:
+
+    cdef:
+        kh_int32_t *table
+
+    def __init__(self, size_hint=1):
+        if size_hint is not None:
+            kh_resize_int32(self.table, size_hint)
+
+    def __cinit__(self):
+        self.table = kh_init_int32()
+
+    def __dealloc__(self):
+        kh_destroy_int32(self.table)
+
+    cdef inline int check_type(self, object val):
+        return PyString_Check(val)
+
+    cpdef get_item(self, int32_t val):
+        cdef khiter_t k
+        k = kh_get_int32(self.table, val)
+        if k != self.table.n_buckets:
+            return self.table.vals[k]
+        else:
+            raise KeyError(val)
+
+    def get_iter_test(self, int32_t key, Py_ssize_t iterations):
+        cdef Py_ssize_t i, val
+        for i in range(iterations):
+            k = kh_get_int32(self.table, val)
+            if k != self.table.n_buckets:
+                val = self.table.vals[k]
+
+    cpdef set_item(self, int32_t key, Py_ssize_t val):
+        cdef:
+            khiter_t k
+            int ret
+
+        k = kh_put_int32(self.table, key, &ret)
+        self.table.keys[k] = key
+        if kh_exist_int32(self.table, k):
+            self.table.vals[k] = val
+        else:
+            raise KeyError(key)
+
+    def map_locations(self, ndarray[int32_t] values):
+        cdef:
+            Py_ssize_t i, n = len(values)
+            int ret
+            int32_t val
+            khiter_t k
+
+        for i in range(n):
+            val = values[i]
+            k = kh_put_int32(self.table, val, &ret)
+            # print 'putting %s, %s' % (val, count)
+            self.table.vals[k] = i
+
+    def lookup_locations(self, ndarray[int32_t] values):
+        cdef:
+            Py_ssize_t i, n = len(values)
+            int ret
+            int32_t val
+            khiter_t k
+            ndarray[int32_t] locs = np.empty(n, dtype='i4')
+
+        for i in range(n):
+            val = values[i]
+            k = kh_get_int32(self.table, val)
+            if k != self.table.n_buckets:
+                locs[i] = self.table.vals[k]
+            else:
+                locs[i] = -1
+
+        return locs
+
+    def factorize(self, ndarray[int32_t] values):
+        cdef:
+            Py_ssize_t i, n = len(values)
+            ndarray[int32_t] labels = np.empty(n, dtype=np.int32)
+            ndarray[int32_t] counts = np.empty(n, dtype=np.int32)
+            dict reverse = {}
+            Py_ssize_t idx, count = 0
+            int ret
+            int32_t val
+            khiter_t k
+
+        for i in range(n):
+            val = values[i]
+            k = kh_get_int32(self.table, val)
+            if k != self.table.n_buckets:
+                idx = self.table.vals[k]
+                labels[i] = idx
+                counts[idx] = counts[idx] + 1
+            else:
+                k = kh_put_int32(self.table, val, &ret)
+                if not ret:
+                    kh_del_int32(self.table, k)
+                self.table.vals[k] = count
+                reverse[count] = val
+                labels[i] = count
+                counts[count] = 1
+                count += 1
+
+        # return None
+        return reverse, labels, counts[:count].copy()
+
 cdef class Int64HashTable:

     cdef:
@@ -315,17 +422,25 @@ cdef class Int64HashTable:

         return locs

-    def factorize(self, ndarray[int64_t] values):
+    def factorize(self, ndarray[object] values):
+        reverse = {}
+        labels, counts = self.get_labels(values, reverse, 0)
+        return reverse, labels, counts
+
+    def get_labels(self, ndarray[int64_t] values, list uniques,
+                   Py_ssize_t count_prior):
        cdef:
            Py_ssize_t i, n = len(values)
-            ndarray[int32_t] labels = np.empty(n, dtype=np.int32)
-            ndarray[int32_t] counts = np.empty(n, dtype=np.int32)
-            dict reverse = {}
-            Py_ssize_t idx, count = 0
+            ndarray[int32_t] labels
+            ndarray[int32_t] counts
+            Py_ssize_t idx, count = count_prior
            int ret
            int64_t val
            khiter_t k

+        labels = np.empty(n, dtype=np.int32)
+        counts = np.empty(count_prior + n, dtype=np.int32)
+
        for i in range(n):
            val = values[i]
            k = kh_get_int64(self.table, val)
@@ -335,16 +450,13 @@ cdef class Int64HashTable:
                counts[idx] = counts[idx] + 1
            else:
                k = kh_put_int64(self.table, val, &ret)
-                if not ret:
-                    kh_del_int64(self.table, k)
                self.table.vals[k] = count
-                reverse[count] = val
+                uniques.append(val)
                labels[i] = count
                counts[count] = 1
                count += 1

-        # return None
-        return reverse, labels, counts[:count].copy()
+        return labels, counts[:count].copy()

 cdef class PyObjectHashTable:

@@ -526,6 +638,37 @@ cdef class Factorizer:
         self.count = len(counts)
         return labels, counts

+cdef class Int64Factorizer:
+
+    cdef public:
+        Int64HashTable table
+        list uniques
+        Py_ssize_t count
+
+    def __init__(self, size_hint):
+        self.table = Int64HashTable(size_hint)
+        self.uniques = []
+        self.count = 0
+
+    def get_count(self):
+        return self.count
+
+    def factorize(self, ndarray[int64_t] values, sort=False):
+        labels, counts = self.table.get_labels(values, self.uniques,
+                                               self.count)
+
+        # sort on
+        if sort:
+            sorter = list_to_object_array(self.uniques).argsort()
+            reverse_indexer = np.empty(len(sorter), dtype=np.int32)
+            reverse_indexer.put(sorter, np.arange(len(sorter)))
+
+            labels = reverse_indexer.take(labels)
+            counts = counts.take(sorter)
+
+        self.count = len(counts)
+        return labels, counts
+
 def lookup_locations2(ndarray[object] values):
     cdef:
         Py_ssize_t i, n = len(values)
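
For readers skimming the khash plumbing: the observable contract of get_labels is that each distinct value receives an int32 label in order of first appearance, new uniques are appended to the caller's list, labels continue from count_prior across calls, and counts records how often each label occurred. A pure-Python sketch of that contract follows (not the hash-table implementation itself; the handling of counts for labels older than count_prior is simplified here).

import numpy as np

def get_labels(values, uniques, count_prior=0):
    # Label values by order of first appearance, starting at count_prior
    # and appending newly seen values to `uniques`.
    mapping = {val: i for i, val in enumerate(uniques)}
    labels = np.empty(len(values), dtype=np.int32)
    counts = np.zeros(count_prior + len(values), dtype=np.int32)
    count = count_prior
    for i, val in enumerate(values):
        if val not in mapping:
            mapping[val] = count
            uniques.append(val)
            count += 1
        labels[i] = mapping[val]
        counts[labels[i]] += 1
    return labels, counts[:count].copy()

uniques = []
labels, counts = get_labels([3, 1, 3, 2], uniques)        # labels: 0 1 0 2
labels2, _ = get_labels([2, 5], uniques, count_prior=3)   # labels: 2 3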

pandas/src/iterator.pyx

Lines changed: 0 additions & 33 deletions
This file was deleted.

pandas/src/join.pyx

Lines changed: 32 additions & 0 deletions
@@ -1,3 +1,4 @@
+import time

 def inner_join(ndarray[int32_t] left, ndarray[int32_t] right,
                Py_ssize_t max_groups):
@@ -170,3 +171,34 @@ def _get_result_indexer(sorter, indexer):
     res = sorter.take(indexer)
     np.putmask(res, indexer == -1, -1)
     return res
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def join_sorter(ndarray[int32_t] index, Py_ssize_t ngroups):
+    cdef:
+        Py_ssize_t i, loc, label, n
+        ndarray[int32_t] counts, where, result
+
+    # count group sizes, location 0 for NA
+    counts = np.zeros(ngroups + 1, dtype='i4')
+    n = len(index)
+    for i from 0 <= i < n:
+        counts[index[i] + 1] += 1
+
+    # mark the start of each contiguous group of like-indexed data
+    where = np.zeros(ngroups + 1, dtype='i4')
+    for i from 1 <= i < ngroups + 1:
+        where[i] = where[i - 1] + counts[i - 1]
+
+    # this is our indexer
+    result = np.zeros(n, dtype='i4')
+    for i from 0 <= i < n:
+        label = index[i] + 1
+        result[where[label]] = i
+        where[label] += 1
+
+    return result, counts
+
+def _big_join_sorter(index):
+    pass
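
join_sorter above is a counting sort on group labels: it tallies group sizes (slot 0 reserved for the -1/NA label), turns the counts into starting offsets, then scatters row positions so rows sharing a label become contiguous. An equivalent NumPy sketch for reference; the Cython loop above is what the commit actually ships.

import numpy as np

def join_sorter(index, ngroups):
    # index: int32 group labels in [-1, ngroups); -1 marks NA.
    counts = np.bincount(index + 1, minlength=ngroups + 1)   # group sizes, NA first
    where = np.zeros(ngroups + 1, dtype='i4')
    where[1:] = np.cumsum(counts[:-1])                       # start offset of each group
    result = np.zeros(len(index), dtype='i4')
    for i, label in enumerate(index + 1):                    # scatter row positions
        result[where[label]] = i
        where[label] += 1
    return result, counts.astype('i4')

labels = np.array([2, 0, -1, 2, 1], dtype='i4')
sorter, counts = join_sorter(labels, 3)   # rows ordered: NA, group 0, group 1, group 2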

pandas/src/khash.h

Lines changed: 2 additions & 0 deletions
@@ -585,9 +585,11 @@ KHASH_SET_INIT_PYOBJECT(pyset)
 #define kh_exist_pyset(h, k) (kh_exist(h, k))
 #define kh_exist_str(h, k) (kh_exist(h, k))
 #define kh_exist_int64(h, k) (kh_exist(h, k))
+#define kh_exist_int32(h, k) (kh_exist(h, k))

 KHASH_MAP_INIT_STR(str, Py_ssize_t)

+KHASH_MAP_INIT_INT(int32, Py_ssize_t)
 KHASH_MAP_INIT_INT64(int64, Py_ssize_t)

 #endif /* __AC_KHASH_H */

pandas/src/khash.pxd

Lines changed: 17 additions & 1 deletion
@@ -1,5 +1,5 @@
 from cpython cimport PyObject
-from numpy cimport int64_t, uint32_t
+from numpy cimport int64_t, int32_t, uint32_t

 cdef extern from "khash.h":
     ctypedef uint32_t khint_t
@@ -71,3 +71,19 @@ cdef extern from "khash.h":

     bint kh_exist_int64(kh_int64_t*, khiter_t)

+    ctypedef struct kh_int32_t:
+        khint_t n_buckets, size, n_occupied, upper_bound
+        uint32_t *flags
+        int32_t *keys
+        Py_ssize_t *vals
+
+    inline kh_int32_t* kh_init_int32()
+    inline void kh_destroy_int32(kh_int32_t*)
+    inline void kh_clear_int32(kh_int32_t*)
+    inline khint_t kh_get_int32(kh_int32_t*, int32_t)
+    inline void kh_resize_int32(kh_int32_t*, khint_t)
+    inline khint_t kh_put_int32(kh_int32_t*, int32_t, int*)
+    inline void kh_del_int32(kh_int32_t*, khiter_t)
+
+    bint kh_exist_int32(kh_int32_t*, khiter_t)
+

pandas/src/skiplist.pyx

Lines changed: 1 addition & 0 deletions
@@ -75,6 +75,7 @@ cdef class IndexableSkiplist:
                 i -= node.width[level]
                 node = node.next[level]

+
        return node.value

    cpdef insert(self, double value):
