From 98f46c20ff81680e0c6724dd21a2c606da2f05e1 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 26 Nov 2016 13:32:06 -0500
Subject: [PATCH] PERF: use StringHashTable for strings in factorizing

allows releasing the GIL on these dtypes

xref #13745
---
 asv_bench/benchmarks/algorithms.py       |   5 +
 asv_bench/benchmarks/gil.py              |  35 +++
 doc/source/whatsnew/v0.20.0.txt          |   2 +-
 pandas/core/algorithms.py                |  33 ++-
 pandas/hashtable.pxd                     |   8 +-
 pandas/hashtable.pyx                     |   8 +-
 pandas/src/hashtable_class_helper.pxi.in | 305 ++++++++++++++++++-----
 7 files changed, 330 insertions(+), 66 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index c4a6117c0704a..20d149493951f 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -8,6 +8,7 @@ class Algorithms(object):
 
     def setup(self):
         N = 100000
+        np.random.seed(1234)
 
         self.int_unique = pd.Int64Index(np.arange(N * 5))
         # cache is_unique
@@ -23,11 +24,15 @@ def setup(self):
         self.arrpos = np.arange(1000000)
         self.arrneg = np.arange(-1000000, 0)
         self.arrmixed = np.array([1, -1]).repeat(500000)
+        self.strings = tm.makeStringIndex(100000)
 
         # match
         self.uniques = tm.makeStringIndex(1000).values
         self.all = self.uniques.repeat(10)
 
+    def time_factorize_string(self):
+        self.strings.factorize()
+
     def time_factorize_int(self):
         self.int.factorize()
 
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
index 3f53894364cd2..1c5e59672cb57 100644
--- a/asv_bench/benchmarks/gil.py
+++ b/asv_bench/benchmarks/gil.py
@@ -379,3 +379,38 @@ def pg_read_csv_datetime(self):
 
     def time_read_csv_datetime(self):
         self.pg_read_csv_datetime()
+
+
+class nogil_factorize(object):
+    number = 1
+    repeat = 5
+
+    def setup(self):
+        if (not have_real_test_parallel):
+            raise NotImplementedError
+
+        np.random.seed(1234)
+        self.strings = tm.makeStringIndex(100000)
+
+    def factorize_strings(self):
+        pd.factorize(self.strings)
+
+    @test_parallel(num_threads=4)
+    def _pg_factorize_strings_4(self):
+        self.factorize_strings()
+
+    def time_factorize_strings_4(self):
+        for i in range(2):
+            self._pg_factorize_strings_4()
+
+    @test_parallel(num_threads=2)
+    def _pg_factorize_strings_2(self):
+        self.factorize_strings()
+
+    def time_factorize_strings_2(self):
+        for i in range(4):
+            self._pg_factorize_strings_2()
+
+    def time_factorize_strings(self):
+        for i in range(8):
+            self.factorize_strings()
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 508093380ac81..2855cde95ac2a 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -135,7 +135,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
-
+- Increased performance of ``pd.factorize()`` by releasing the GIL when the ``object`` dtype is inferred as strings (:issue:`14859`)
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index effca6398419e..0d4d4143e6b9b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
         values = np.array(values, dtype='O')
 
     f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
-    result = _hashtable_algo(f, values.dtype, np.int64)
+    result = _hashtable_algo(f, values, np.int64)
 
     if na_sentinel != -1:
 
@@ -102,7 +102,7 @@ def unique(values):
         values = com._asarray_tuplesafe(values)
 
     f = lambda htype, caster: _unique_generic(values, htype, caster)
-    return _hashtable_algo(f, values.dtype)
+    return _hashtable_algo(f, values)
 
 
 def _unique_generic(values, table_type, type_caster):
@@ -759,10 +759,12 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
 # helpers #
 # ------- #
 
-def _hashtable_algo(f, dtype, return_dtype=None):
+def _hashtable_algo(f, values, return_dtype=None):
     """
     f(HashTable, type_caster) -> result
     """
+
+    dtype = values.dtype
     if is_float_dtype(dtype):
         return f(htable.Float64HashTable, _ensure_float64)
     elif is_integer_dtype(dtype):
@@ -773,17 +775,25 @@ def _hashtable_algo(f, values, return_dtype=None):
     elif is_timedelta64_dtype(dtype):
         return_dtype = return_dtype or 'm8[ns]'
         return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
-    else:
-        return f(htable.PyObjectHashTable, _ensure_object)
+
+    # it's cheaper to use a StringHashTable than a PyObjectHashTable
+    if lib.infer_dtype(values) in ['string']:
+        return f(htable.StringHashTable, _ensure_object)
+
+    # use Object
+    return f(htable.PyObjectHashTable, _ensure_object)
 
 
 _hashtables = {
     'float64': (htable.Float64HashTable, htable.Float64Vector),
     'int64': (htable.Int64HashTable, htable.Int64Vector),
+    'string': (htable.StringHashTable, htable.ObjectVector),
     'generic': (htable.PyObjectHashTable, htable.ObjectVector)
 }
 
 
 def _get_data_algo(values, func_map):
+
+    f = None
     if is_float_dtype(values):
         f = func_map['float64']
         values = _ensure_float64(values)
@@ -796,8 +806,19 @@ def _get_data_algo(values, func_map):
         f = func_map['int64']
         values = _ensure_int64(values)
     else:
-        f = func_map['generic']
+        values = _ensure_object(values)
+
+        # it's cheaper to use a StringHashTable than a PyObjectHashTable
+        if lib.infer_dtype(values) in ['string']:
+            try:
+                f = func_map['string']
+            except KeyError:
+                pass
+
+    if f is None:
+        f = func_map['generic']
+
     return f, values
 
diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd
index 97b6687d061e9..f3ea7ad792160 100644
--- a/pandas/hashtable.pxd
+++ b/pandas/hashtable.pxd
@@ -1,4 +1,4 @@
-from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
+from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
 
 # prototypes for sharing
 
@@ -22,3 +22,9 @@ cdef class PyObjectHashTable(HashTable):
 
     cpdef get_item(self, object val)
     cpdef set_item(self, object key, Py_ssize_t val)
+
+cdef class StringHashTable(HashTable):
+    cdef kh_str_t *table
+
+    cpdef get_item(self, object val)
+    cpdef set_item(self, object key, Py_ssize_t val)
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index 3bda3f49cb054..ce760b49fabc0 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -4,7 +4,11 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
 
 from khash cimport *
 from numpy cimport *
-from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
+
+from libc.stdlib cimport malloc, free
+from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
+                      PyString_Check, PyBytes_Check,
+                      PyUnicode_Check)
 
 from util cimport _checknan
 cimport util
@@ -33,7 +37,7 @@ PyDateTime_IMPORT
 cdef extern from "Python.h":
     int PySlice_Check(object)
 
-cdef size_t _INIT_VEC_CAP = 32
+cdef size_t _INIT_VEC_CAP = 128
 
 include "hashtable_class_helper.pxi"
 
diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in
index 14e5363eee20c..22714e6305677 100644
--- a/pandas/src/hashtable_class_helper.pxi.in
+++ b/pandas/src/hashtable_class_helper.pxi.in
@@ -10,23 +10,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 
 {{py:
 
-# name, dtype
-dtypes = [('Float64', 'float64'), ('Int64', 'int64')]
-
+# name, dtype, arg
+# the generated StringVector is not actually used
+# but is included for completeness (ObjectVector is
+# used instead for uniques in the hashtables)
+
+dtypes = [('Float64', 'float64', 'float64_t'),
+          ('Int64', 'int64', 'int64_t'),
+          ('String', 'string', 'char *')]
 
 }}
 
-{{for name, dtype in dtypes}}
+{{for name, dtype, arg in dtypes}}
 
 ctypedef struct {{name}}VectorData:
-    {{dtype}}_t *data
+    {{arg}} *data
     size_t n, m
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-cdef void append_data_{{dtype}}({{name}}VectorData *data,
-                                {{dtype}}_t x) nogil:
+cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
+                                       {{arg}} x) nogil:
 
     data.data[data.n] = x
     data.n += 1
@@ -36,8 +41,9 @@ cdef void append_data_{{dtype}}({{name}}VectorData *data,
 ctypedef fused vector_data:
     Int64VectorData
     Float64VectorData
+    StringVectorData
 
-cdef bint needs_resize(vector_data *data) nogil:
+cdef inline bint needs_resize(vector_data *data) nogil:
     return data.n == data.m
 
 #----------------------------------------------------------------------
@@ -46,12 +52,13 @@ cdef bint needs_resize(vector_data *data) nogil:
 # Vector
 #----------------------------------------------------------------------
 
 {{py:
 
-# name, dtype
-dtypes = [('Float64', 'float64'), ('Int64', 'int64')]
+# name, dtype, arg, idtype
+dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
+          ('Int64', 'int64', 'int64_t', 'np.int64')]
 
 }}
 
-{{for name, dtype in dtypes}}
+{{for name, dtype, arg, idtype in dtypes}}
 
 cdef class {{name}}Vector:
 
@@ -66,13 +73,13 @@ cdef class {{name}}Vector:
             raise MemoryError()
         self.data.n = 0
         self.data.m = _INIT_VEC_CAP
-        self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
-        self.data.data = <{{dtype}}_t*> self.ao.data
+        self.ao = np.empty(self.data.m, dtype={{idtype}})
+        self.data.data = <{{arg}}*> self.ao.data
 
     cdef resize(self):
         self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
         self.ao.resize(self.data.m)
-        self.data.data = <{{dtype}}_t*> self.ao.data
+        self.data.data = <{{arg}}*> self.ao.data
 
     def __dealloc__(self):
         PyMem_Free(self.data)
@@ -85,7 +92,7 @@ cdef class {{name}}Vector:
         self.data.m = self.data.n
         return self.ao
 
-    cdef inline void append(self, {{dtype}}_t x):
+    cdef inline void append(self, {{arg}} x):
 
         if needs_resize(self.data):
             self.resize()
@@ -94,6 +101,61 @@ cdef class {{name}}Vector:
 
 {{endfor}}
 
+cdef class StringVector:
+
+    cdef:
+        StringVectorData *data
+
+    def __cinit__(self):
+        self.data = <StringVectorData *>PyMem_Malloc(
+            sizeof(StringVectorData))
+        if not self.data:
+            raise MemoryError()
+        self.data.n = 0
+        self.data.m = _INIT_VEC_CAP
+        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+
+    cdef resize(self):
+        cdef:
+            char **orig_data
+            size_t i, m
+
+        m = self.data.m
+        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+
+        # TODO: can resize?
+        orig_data = self.data.data
+        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+        for i in range(m):
+            self.data.data[i] = orig_data[i]
+
+    def __dealloc__(self):
+        free(self.data.data)
+        PyMem_Free(self.data)
+
+    def __len__(self):
+        return self.data.n
+
+    def to_array(self):
+        cdef:
+            ndarray ao
+            size_t n
+            object val
+
+        ao = np.empty(self.data.n, dtype=np.object)
+        for i in range(self.data.n):
+            val = self.data.data[i]
+            ao[i] = val
+        self.data.m = self.data.n
+        return ao
+
+    cdef inline void append(self, char * x):
+
+        if needs_resize(self.data):
+            self.resize()
+
+        append_data_string(self.data, x)
+
 
 cdef class ObjectVector:
 
@@ -377,9 +439,11 @@ cdef class {{name}}HashTable(HashTable):
 
 cdef class StringHashTable(HashTable):
-    cdef kh_str_t *table
+    # these by definition *must* be strings
+    # or a sentinel np.nan / None missing value
+    na_string_sentinel = '__nan__'
 
-    def __cinit__(self, int size_hint=1):
+    def __init__(self, int size_hint=1):
         self.table = kh_init_str()
         if size_hint is not None:
             kh_resize_str(self.table, size_hint)
@@ -388,17 +452,26 @@ cdef class StringHashTable(HashTable):
         kh_destroy_str(self.table)
 
     cpdef get_item(self, object val):
-        cdef khiter_t k
-        k = kh_get_str(self.table, util.get_c_string(val))
+        cdef:
+            khiter_t k
+            char *v
+
+        v = util.get_c_string(val)
+
+        k = kh_get_str(self.table, v)
         if k != self.table.n_buckets:
             return self.table.vals[k]
         else:
             raise KeyError(val)
 
     def get_iter_test(self, object key, Py_ssize_t iterations):
-        cdef Py_ssize_t i, val
+        cdef:
+            Py_ssize_t i, val
+            char *v
+
+        v = util.get_c_string(key)
+
         for i in range(iterations):
-            k = kh_get_str(self.table, util.get_c_string(key))
+            k = kh_get_str(self.table, v)
             if k != self.table.n_buckets:
                 val = self.table.vals[k]
 
@@ -406,83 +479,203 @@ cdef class StringHashTable(HashTable):
         cdef:
             khiter_t k
             int ret = 0
-            char* buf
+            char *v
 
-        buf = util.get_c_string(key)
+        v = util.get_c_string(key)
 
-        k = kh_put_str(self.table, buf, &ret)
+        k = kh_put_str(self.table, v, &ret)
         self.table.keys[k] = key
         if kh_exist_str(self.table, k):
             self.table.vals[k] = val
         else:
             raise KeyError(key)
 
+    @cython.boundscheck(False)
     def get_indexer(self, ndarray[object] values):
         cdef:
             Py_ssize_t i, n = len(values)
             ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
-            char *buf
             int64_t *resbuf = <int64_t *>labels.data
             khiter_t k
             kh_str_t *table = self.table
+            char *v
+            char **vecs
 
+        vecs = <char **>malloc(n * sizeof(char *))
         for i in range(n):
-            buf = util.get_c_string(values[i])
-            k = kh_get_str(table, buf)
-            if k != table.n_buckets:
-                resbuf[i] = table.vals[k]
-            else:
-                resbuf[i] = -1
+            val = values[i]
+            v = util.get_c_string(val)
+            vecs[i] = v
+
+        with nogil:
+            for i in range(n):
+                k = kh_get_str(table, vecs[i])
+                if k != table.n_buckets:
+                    resbuf[i] = table.vals[k]
+                else:
+                    resbuf[i] = -1
+
+        free(vecs)
         return labels
 
+    @cython.boundscheck(False)
     def unique(self, ndarray[object] values):
         cdef:
-            Py_ssize_t i, n = len(values)
+            Py_ssize_t i, count, n = len(values)
+            int64_t[:] uindexer
             int ret = 0
             object val
-            char *buf
+            ObjectVector uniques
             khiter_t k
-            ObjectVector uniques = ObjectVector()
+            char *v
+            char **vecs
 
+        vecs = <char **>malloc(n * sizeof(char *))
+        uindexer = np.empty(n, dtype=np.int64)
         for i in range(n):
             val = values[i]
-            buf = util.get_c_string(val)
-            k = kh_get_str(self.table, buf)
-            if k == self.table.n_buckets:
-                kh_put_str(self.table, buf, &ret)
-                uniques.append(val)
+            v = util.get_c_string(val)
+            vecs[i] = v
+
+        count = 0
+        with nogil:
+            for i in range(n):
+                v = vecs[i]
+                k = kh_get_str(self.table, v)
+                if k == self.table.n_buckets:
+                    kh_put_str(self.table, v, &ret)
+                    uindexer[count] = i
+                    count += 1
+        free(vecs)
+
+        # uniques
+        uniques = ObjectVector()
+        for i in range(count):
+            uniques.append(values[uindexer[i]])
         return uniques.to_array()
 
     def factorize(self, ndarray[object] values):
+        uniques = ObjectVector()
+        labels = self.get_labels(values, uniques, 0, -1)
+        return uniques.to_array(), labels
+
+    @cython.boundscheck(False)
+    def lookup(self, ndarray[object] values):
         cdef:
             Py_ssize_t i, n = len(values)
-            ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
-            dict reverse = {}
-            Py_ssize_t idx, count = 0
             int ret = 0
             object val
-            char *buf
+            char *v
+            char **vecs
             khiter_t k
+            int64_t[:] locs = np.empty(n, dtype=np.int64)
 
+        # these by definition *must* be strings
+        vecs = <char **>malloc(n * sizeof(char *))
         for i in range(n):
             val = values[i]
-            buf = util.get_c_string(val)
-            k = kh_get_str(self.table, buf)
-            if k != self.table.n_buckets:
-                idx = self.table.vals[k]
-                labels[i] = idx
+
+            if PyUnicode_Check(val) or PyString_Check(val):
+                v = util.get_c_string(val)
             else:
-                k = kh_put_str(self.table, buf, &ret)
-                # print 'putting %s, %s' % (val, count)
+                v = util.get_c_string(self.na_string_sentinel)
+            vecs[i] = v
 
-                self.table.vals[k] = count
-                reverse[count] = val
-                labels[i] = count
-                count += 1
+        with nogil:
+            for i in range(n):
+                v = vecs[i]
+                k = kh_get_str(self.table, v)
+                if k != self.table.n_buckets:
+                    locs[i] = self.table.vals[k]
+                else:
+                    locs[i] = -1
 
-        return reverse, labels
+        free(vecs)
+        return np.asarray(locs)
 
+    @cython.boundscheck(False)
+    def map_locations(self, ndarray[object] values):
+        cdef:
+            Py_ssize_t i, n = len(values)
+            int ret = 0
+            object val
+            char *v
+            char **vecs
+            khiter_t k
+
+        # these by definition *must* be strings
+        vecs = <char **>malloc(n * sizeof(char *))
+        for i in range(n):
+            val = values[i]
+
+            if PyUnicode_Check(val) or PyString_Check(val):
+                v = util.get_c_string(val)
+            else:
+                v = util.get_c_string(self.na_string_sentinel)
+            vecs[i] = v
+
+        with nogil:
+            for i in range(n):
+                v = vecs[i]
+                k = kh_put_str(self.table, v, &ret)
+                self.table.vals[k] = i
+        free(vecs)
+
+    @cython.boundscheck(False)
+    def get_labels(self, ndarray[object] values, ObjectVector uniques,
+                   Py_ssize_t count_prior, int64_t na_sentinel,
+                   bint check_null=1):
+        cdef:
+            Py_ssize_t i, n = len(values)
+            int64_t[:] labels
+            int64_t[:] uindexer
+            Py_ssize_t idx, count = count_prior
+            int ret = 0
+            object val
+            char *v
+            char **vecs
+            khiter_t k
+
+        # these by definition *must* be strings
+        labels = np.zeros(n, dtype=np.int64)
+        uindexer = np.empty(n, dtype=np.int64)
+
+        # pre-filter out missing
+        # and assign pointers
+        vecs = <char **>malloc(n * sizeof(char *))
+        for i in range(n):
+            val = values[i]
+
+            if PyUnicode_Check(val) or PyString_Check(val):
+                v = util.get_c_string(val)
+                vecs[i] = v
+            else:
+                labels[i] = na_sentinel
+
+        # compute
+        with nogil:
+            for i in range(n):
+                if labels[i] == na_sentinel:
+                    continue
+
+                v = vecs[i]
+                k = kh_get_str(self.table, v)
+                if k != self.table.n_buckets:
+                    idx = self.table.vals[k]
+                    labels[i] = idx
+                else:
+                    k = kh_put_str(self.table, v, &ret)
+                    self.table.vals[k] = count
+                    uindexer[count] = i
+                    labels[i] = count
+                    count += 1
+
+        free(vecs)
+
+        # uniques
+        for i in range(count):
+            uniques.append(values[uindexer[i]])
+
+        return np.asarray(labels)
 
 na_sentinel = object
 
@@ -639,4 +832,4 @@ cdef class PyObjectHashTable(HashTable):
                 labels[i] = count
                 count += 1
 
-        return np.asarray(labels)
\ No newline at end of file
+        return np.asarray(labels)
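
For review context, a minimal sketch (not part of the patch) of the behavior it targets: because the hot loops in StringHashTable.get_labels/unique/get_indexer now run under "with nogil", concurrent pd.factorize calls on string data can overlap on multiple cores. This assumes a pandas build that includes the patch; ThreadPoolExecutor stands in for the test_parallel helper used by the gil.py benchmark, and the timing comparison is illustrative only.

    import numpy as np
    import pandas as pd
    import pandas.util.testing as tm
    from concurrent.futures import ThreadPoolExecutor
    from timeit import default_timer as timer

    np.random.seed(1234)
    strings = tm.makeStringIndex(100000)  # object dtype, inferred as 'string'

    def work(_):
        # takes the StringHashTable path; the labeling loop releases the GIL
        pd.factorize(strings)

    # serial: 4 factorizations back to back
    start = timer()
    for i in range(4):
        work(i)
    serial = timer() - start

    # threaded: with the GIL released, the 4 calls can run in parallel
    start = timer()
    with ThreadPoolExecutor(max_workers=4) as pool:
        list(pool.map(work, range(4)))
    threaded = timer() - start

    print("serial %.3fs, 4 threads %.3fs" % (serial, threaded))

Before this change the object-dtype path held the GIL for the whole loop, so the threaded run would take roughly as long as the serial one; pre-extracting the char* pointers is what makes the lookup/insert loop safe to run without the GIL.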