Skip to content

PERF: upgrade khash lib to 0.2.8 #8547

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
124 changes: 123 additions & 1 deletion pandas/hashtable.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
from cpython cimport (PyObject, Py_INCREF, PyList_Check, PyTuple_Check,
PyString_AsStringAndSize, PyDict_Copy)

from khash cimport *
from numpy cimport *
Expand Down Expand Up @@ -843,6 +844,127 @@ cdef class PyObjectHashTable(HashTable):
return labels


cdef inline cbuf_t to_cbuf(object s):
    """
    Return a ``cbuf_t`` (buffer pointer + length) describing string ``s``.

    The pointer is whatever ``PyString_AsStringAndSize`` hands back; per the
    CPython C API that is the string object's internal buffer, not a copy.
    No reference to ``s`` is taken here, so the result is only valid while
    the caller keeps ``s`` alive.
    """
    cdef cbuf_t output
    # If the conversion fails (e.g. ``s`` is not a string) the out-parameters
    # are not guaranteed to be written. Start from an empty buffer so callers
    # never hash or compare uninitialised memory.
    output.buf = NULL
    output.len = 0
    PyString_AsStringAndSize(s, <char**>&output.buf, &output.len)
    return output


cdef class CBufHashTable(HashTable):
    """
    Hash table mapping string keys, identified by their raw character buffer
    and length, to integer values.

    Keys are converted with ``to_cbuf`` and the resulting ``cbuf_t`` is what
    gets stored in the table. That struct only points at the string object's
    own buffer and no reference to the object is taken, so every object that
    has been inserted as a key must be kept alive by the caller for as long
    as the table is used.
    """
    cdef kh_cbuf_map_t *table

    def __cinit__(self, int size_hint=1):
        self.table = kh_init_cbuf_map()
        # ``size_hint`` is a C int and can never be None, so the table is
        # always pre-sized.
        kh_resize_cbuf_map(self.table, size_hint)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_cbuf_map(self.table)
            self.table = NULL

    cdef inline int check_type(self, object val):
        """Return non-zero if ``val`` is a string object."""
        return util.is_string_object(val)

    cpdef get_item(self, object val):
        """
        Return the value stored for key ``val``.

        Raises
        ------
        KeyError
            If ``val`` is not present in the table.
        """
        cdef khiter_t it
        it = kh_get_cbuf_map(self.table, to_cbuf(val))
        # A lookup that returns n_buckets means "not found".
        if it != self.table.n_buckets:
            return self.table.vals[it]
        else:
            raise KeyError(val)

    def get_iter_test(self, object key, Py_ssize_t iterations):
        """
        Look up ``key`` ``iterations`` times and discard the result.

        Nothing is returned; the method only exercises the lookup path.
        """
        cdef khiter_t it
        cdef Py_ssize_t i, val
        for i in range(iterations):
            it = kh_get_cbuf_map(self.table, to_cbuf(key))
            if it != self.table.n_buckets:
                val = self.table.vals[it]

    cpdef set_item(self, object key, Py_ssize_t val):
        """
        Store ``val`` under ``key``, inserting the key if necessary.

        The table keeps a pointer into ``key``'s buffer; the caller must keep
        ``key`` alive while the table is in use.

        Raises
        ------
        KeyError
            If the slot returned by the insertion is not marked as occupied.
        """
        cdef:
            khiter_t it
            int ret = 0
            cbuf_t buf

        buf = to_cbuf(key)

        it = kh_put_cbuf_map(self.table, buf, &ret)
        # Always point the slot at the buffer of the most recent key object.
        self.table.keys[it] = buf
        if kh_exist_cbuf_map(self.table, it):
            self.table.vals[it] = val
        else:
            raise KeyError(key)

    def get_indexer(self, ndarray[object] values):
        """
        Return an int64 array holding, for each element of ``values``, the
        value stored for it in the table, or -1 if it is not present.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
            cbuf_t buf
            int64_t[::1] out = labels
            khiter_t it
            kh_cbuf_map_t *table = self.table

        for i in range(n):
            buf = to_cbuf(values[i])
            it = kh_get_cbuf_map(table, buf)
            if it != table.n_buckets:
                out[i] = table.vals[it]
            else:
                out[i] = -1
        return labels

    def unique(self, ndarray[object] values):
        """
        Return the elements of ``values`` that were not already keys of the
        table, in order of first appearance.

        Each such element is inserted as a key; no value is assigned to the
        newly inserted keys here.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            cbuf_t buf
            khiter_t it
            ObjectVector uniques = ObjectVector()

        for i in range(n):
            val = values[i]
            buf = to_cbuf(val)
            it = kh_get_cbuf_map(self.table, buf)
            if it == self.table.n_buckets:
                it = kh_put_cbuf_map(self.table, buf, &ret)
                uniques.append(val)

        return uniques.to_array()

    def factorize(self, ndarray[object] values):
        """
        Encode ``values`` as integer labels.

        Keys already in the table keep their stored value as label; each key
        seen for the first time is inserted and numbered consecutively
        starting from 0 within this call.

        Returns
        -------
        (dict, ndarray[int64])
            A dict mapping each label assigned in this call to the object it
            was assigned to, and the array of labels, one per input element.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
            list reverse = []
            Py_ssize_t idx, count = 0
            int ret = 0
            object val
            cbuf_t buf
            khiter_t it

        for i in range(n):
            val = values[i]
            buf = to_cbuf(val)
            it = kh_get_cbuf_map(self.table, buf)
            if it != self.table.n_buckets:
                idx = self.table.vals[it]
                labels[i] = idx
            else:
                it = kh_put_cbuf_map(self.table, buf, &ret)

                self.table.vals[it] = count
                reverse.append(val)
                labels[i] = count
                count += 1

        # PyDict_Copy only accepts a real dict; build the mapping from the
        # (label, object) pairs with the dict constructor instead.
        return dict(enumerate(reverse)), labels




cdef class Factorizer:
cdef public PyObjectHashTable table
cdef public ObjectVector uniques
Expand Down
25 changes: 18 additions & 7 deletions pandas/src/khash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ cdef extern from "khash_python.h":
inline khint_t kh_get_pymap(kh_pymap_t*, PyObject*)
inline void kh_resize_pymap(kh_pymap_t*, khint_t)
inline khint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*)
inline void kh_del_pymap(kh_pymap_t*, khint_t)

bint kh_exist_pymap(kh_pymap_t*, khiter_t)

Expand All @@ -33,7 +32,6 @@ cdef extern from "khash_python.h":
inline khint_t kh_get_pyset(kh_pyset_t*, PyObject*)
inline void kh_resize_pyset(kh_pyset_t*, khint_t)
inline khint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*)
inline void kh_del_pyset(kh_pyset_t*, khint_t)

bint kh_exist_pyset(kh_pyset_t*, khiter_t)

Expand All @@ -51,7 +49,6 @@ cdef extern from "khash_python.h":
inline khint_t kh_get_str(kh_str_t*, kh_cstr_t)
inline void kh_resize_str(kh_str_t*, khint_t)
inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*)
inline void kh_del_str(kh_str_t*, khint_t)

bint kh_exist_str(kh_str_t*, khiter_t)

Expand All @@ -68,7 +65,6 @@ cdef extern from "khash_python.h":
inline khint_t kh_get_int64(kh_int64_t*, int64_t)
inline void kh_resize_int64(kh_int64_t*, khint_t)
inline khint_t kh_put_int64(kh_int64_t*, int64_t, int*)
inline void kh_del_int64(kh_int64_t*, khint_t)

bint kh_exist_int64(kh_int64_t*, khiter_t)

Expand All @@ -84,7 +80,6 @@ cdef extern from "khash_python.h":
inline khint_t kh_get_float64(kh_float64_t*, float64_t)
inline void kh_resize_float64(kh_float64_t*, khint_t)
inline khint_t kh_put_float64(kh_float64_t*, float64_t, int*)
inline void kh_del_float64(kh_float64_t*, khint_t)

bint kh_exist_float64(kh_float64_t*, khiter_t)

Expand All @@ -100,7 +95,6 @@ cdef extern from "khash_python.h":
inline khint_t kh_get_int32(kh_int32_t*, int32_t)
inline void kh_resize_int32(kh_int32_t*, khint_t)
inline khint_t kh_put_int32(kh_int32_t*, int32_t, int*)
inline void kh_del_int32(kh_int32_t*, khint_t)

bint kh_exist_int32(kh_int32_t*, khiter_t)

Expand All @@ -118,7 +112,24 @@ cdef extern from "khash_python.h":
inline khint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t)
inline void kh_resize_strbox(kh_strbox_t*, khint_t)
inline khint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*)
inline void kh_del_strbox(kh_strbox_t*, khint_t)

bint kh_exist_strbox(kh_strbox_t*, khiter_t)

# Key type of the cbuf hash map: a character buffer together with an explicit
# length. The struct holds only the pointer; nothing here owns the memory.
ctypedef struct cbuf_t:
kh_cstr_t buf
Py_ssize_t len

# Table struct for the map from cbuf_t keys to size_t values.
# NOTE(review): field layout is presumably dictated by the C header this
# extern block wraps -- confirm against the header before changing it.
ctypedef struct kh_cbuf_map_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
cbuf_t *keys
size_t *vals

# Operations on kh_cbuf_map_t. Callers in hashtable.pyx treat a kh_get result
# equal to n_buckets as "key not found".
inline kh_cbuf_map_t* kh_init_cbuf_map()
inline void kh_destroy_cbuf_map(kh_cbuf_map_t*)
inline void kh_clear_cbuf_map(kh_cbuf_map_t*)
inline khint_t kh_get_cbuf_map(kh_cbuf_map_t*, cbuf_t)
inline void kh_resize_cbuf_map(kh_cbuf_map_t*, khint_t)
inline khint_t kh_put_cbuf_map(kh_cbuf_map_t*, cbuf_t, int*)

bint kh_exist_cbuf_map(kh_cbuf_map_t*, khiter_t)
Loading