Skip to content

PERF: use StringHashTable for strings #14859

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class Algorithms(object):

def setup(self):
N = 100000
np.random.seed(1234)

self.int_unique = pd.Int64Index(np.arange(N * 5))
# cache is_unique
Expand All @@ -23,11 +24,15 @@ def setup(self):
self.arrpos = np.arange(1000000)
self.arrneg = np.arange(-1000000, 0)
self.arrmixed = np.array([1, -1]).repeat(500000)
self.strings = tm.makeStringIndex(100000)

# match
self.uniques = tm.makeStringIndex(1000).values
self.all = self.uniques.repeat(10)

def time_factorize_string(self):
self.strings.factorize()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be factorize_string


def time_factorize_int(self):
self.int.factorize()

Expand Down
35 changes: 35 additions & 0 deletions asv_bench/benchmarks/gil.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,3 +379,38 @@ def pg_read_csv_datetime(self):

def time_read_csv_datetime(self):
self.pg_read_csv_datetime()


class nogil_factorize(object):
    """Time ``pd.factorize`` on a string index, serially and via ``test_parallel``.

    NOTE(review): the loop counts (8 serial, 4 with ``num_threads=2``, 2 with
    ``num_threads=4``) presumably keep the total number of factorize calls
    equal across variants -- confirm against ``test_parallel``'s semantics.
    """

    # Benchmark timing attributes read by the asv runner (see asv docs).
    number = 1
    repeat = 5

    def setup(self):
        """Build the string index used by every timing method.

        Raises
        ------
        NotImplementedError
            When ``have_real_test_parallel`` is falsy.
        """
        if not have_real_test_parallel:
            raise NotImplementedError

        # Fixed seed so the generated index is reproducible between runs.
        np.random.seed(1234)
        self.strings = tm.makeStringIndex(100000)

    def factorize_strings(self):
        """Run a single ``pd.factorize`` over the prepared string index."""
        pd.factorize(self.strings)

    @test_parallel(num_threads=2)
    def _pg_factorize_strings_2(self):
        self.factorize_strings()

    @test_parallel(num_threads=4)
    def _pg_factorize_strings_4(self):
        self.factorize_strings()

    def time_factorize_strings(self):
        """Serial baseline: eight factorize calls."""
        for _ in range(8):
            self.factorize_strings()

    def time_factorize_strings_2(self):
        """Four invocations of the ``num_threads=2`` wrapper."""
        for _ in range(4):
            self._pg_factorize_strings_2()

    def time_factorize_strings_4(self):
        """Two invocations of the ``num_threads=4`` wrapper."""
        for _ in range(2):
            self._pg_factorize_strings_4()
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)

- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)


Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add number


Expand Down
33 changes: 27 additions & 6 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
values = np.array(values, dtype='O')

f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
result = _hashtable_algo(f, values.dtype, np.int64)
result = _hashtable_algo(f, values, np.int64)

if na_sentinel != -1:

Expand Down Expand Up @@ -102,7 +102,7 @@ def unique(values):
values = com._asarray_tuplesafe(values)

f = lambda htype, caster: _unique_generic(values, htype, caster)
return _hashtable_algo(f, values.dtype)
return _hashtable_algo(f, values)


def _unique_generic(values, table_type, type_caster):
Expand Down Expand Up @@ -759,10 +759,12 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
# helpers #
# ------- #

def _hashtable_algo(f, dtype, return_dtype=None):
def _hashtable_algo(f, values, return_dtype=None):
"""
f(HashTable, type_caster) -> result
"""

dtype = values.dtype
if is_float_dtype(dtype):
return f(htable.Float64HashTable, _ensure_float64)
elif is_integer_dtype(dtype):
Expand All @@ -773,17 +775,25 @@ def _hashtable_algo(f, dtype, return_dtype=None):
elif is_timedelta64_dtype(dtype):
return_dtype = return_dtype or 'm8[ns]'
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
else:
return f(htable.PyObjectHashTable, _ensure_object)

# it's cheaper to use a String Hash Table than Object
if lib.infer_dtype(values) in ['string']:
return f(htable.StringHashTable, _ensure_object)

# use Object
return f(htable.PyObjectHashTable, _ensure_object)

# Lookup table keyed by data kind ('float64', 'int64', 'string', 'generic').
# Each value is a (hash table, vector) pair of ``htable`` attributes; the
# 'string' and 'generic' entries share ``htable.ObjectVector`` and differ only
# in the hash table they name.
# NOTE(review): these keys match the ones ``_get_data_algo`` reads from its
# ``func_map`` argument -- presumably this dict is passed there; confirm at
# the call sites.
_hashtables = {
'float64': (htable.Float64HashTable, htable.Float64Vector),
'int64': (htable.Int64HashTable, htable.Int64Vector),
'string': (htable.StringHashTable, htable.ObjectVector),
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
}


def _get_data_algo(values, func_map):

f = None
if is_float_dtype(values):
f = func_map['float64']
values = _ensure_float64(values)
Expand All @@ -796,8 +806,19 @@ def _get_data_algo(values, func_map):
f = func_map['int64']
values = _ensure_int64(values)
else:
f = func_map['generic']

values = _ensure_object(values)

# it's cheaper to use a String Hash Table than Object
if lib.infer_dtype(values) in ['string']:
try:
f = func_map['string']
except KeyError:
pass

if f is None:
f = func_map['generic']

return f, values


Expand Down
8 changes: 7 additions & 1 deletion pandas/hashtable.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t

# prototypes for sharing

Expand All @@ -22,3 +22,9 @@ cdef class PyObjectHashTable(HashTable):

cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)

cdef class StringHashTable(HashTable):
cdef kh_str_t *table

cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)
8 changes: 6 additions & 2 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check

from khash cimport *
from numpy cimport *
from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free

from libc.stdlib cimport malloc, free
from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
PyString_Check, PyBytes_Check,
PyUnicode_Check)

from util cimport _checknan
cimport util
Expand Down Expand Up @@ -33,7 +37,7 @@ PyDateTime_IMPORT
cdef extern from "Python.h":
int PySlice_Check(object)

cdef size_t _INIT_VEC_CAP = 32
cdef size_t _INIT_VEC_CAP = 128


include "hashtable_class_helper.pxi"
Expand Down
Loading