diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 1bbffaa7bb5d2..e402a4b7c0ccc 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -66,13 +66,18 @@ cdef class Factorizer: self.uniques = ObjectVector() self.count = 0 - def get_count(self): + def get_count(self) -> int: return self.count def factorize( self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None - ): + ) -> np.ndarray: """ + + Returns + ------- + np.ndarray[np.intp] + Examples -------- Factorize values with nans replaced by na_sentinel @@ -80,6 +85,9 @@ cdef class Factorizer: >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ + cdef: + ndarray[intp_t] labels + if self.uniques.external_view_exists: uniques = ObjectVector() uniques.extend(self.uniques.to_array()) @@ -89,8 +97,6 @@ cdef class Factorizer: mask = (labels == na_sentinel) # sort on if sort: - if labels.dtype != np.intp: - labels = labels.astype(np.intp) sorter = self.uniques.to_array().argsort() reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) @@ -119,8 +125,12 @@ cdef class Int64Factorizer: return self.count def factorize(self, const int64_t[:] values, sort=False, - na_sentinel=-1, na_value=None): + na_sentinel=-1, na_value=None) -> np.ndarray: """ + Returns + ------- + ndarray[intp_t] + Examples -------- Factorize values with nans replaced by na_sentinel @@ -128,6 +138,9 @@ cdef class Int64Factorizer: >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ + cdef: + ndarray[intp_t] labels + if self.uniques.external_view_exists: uniques = Int64Vector() uniques.extend(self.uniques.to_array()) @@ -138,9 +151,6 @@ cdef class Int64Factorizer: # sort on if sort: - if labels.dtype != np.intp: - labels = labels.astype(np.intp) - sorter = self.uniques.to_array().argsort() reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, 
np.arange(len(sorter))) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 0b6bb170cc531..6ace327ca3599 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -539,12 +539,12 @@ cdef class {{name}}HashTable(HashTable): ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) - int64_t[:] labels + intp_t[:] labels int ret = 0 {{c_type}} val, na_value2 khiter_t k @@ -553,7 +553,7 @@ cdef class {{name}}HashTable(HashTable): uint8_t[:] mask_values if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.intp) ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None @@ -614,7 +614,7 @@ cdef class {{name}}HashTable(HashTable): labels[i] = idx if return_inverse: - return uniques.to_array(), np.asarray(labels) + return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): @@ -633,7 +633,7 @@ cdef class {{name}}HashTable(HashTable): ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) + labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques """ uniques = {{name}}Vector() @@ -668,7 +668,7 @@ cdef class {{name}}HashTable(HashTable): ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[intp_t] The labels from values to uniques """ uniques_vector = {{name}}Vector() @@ -918,12 +918,12 @@ cdef class StringHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + 
labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) - int64_t[:] labels + intp_t[:] labels int64_t[:] uindexer int ret = 0 object val @@ -933,7 +933,7 @@ cdef class StringHashTable(HashTable): bint use_na_value if return_inverse: - labels = np.zeros(n, dtype=np.int64) + labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -972,13 +972,13 @@ cdef class StringHashTable(HashTable): uindexer[count] = i if return_inverse: self.table.vals[k] = count - labels[i] = <int64_t>count + labels[i] = count count += 1 elif return_inverse: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] - labels[i] = <int64_t>idx + labels[i] = idx free(vecs) @@ -987,7 +987,7 @@ cdef class StringHashTable(HashTable): uniques.append(values[uindexer[i]]) if return_inverse: - return uniques.to_array(), np.asarray(labels) + return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): @@ -1193,19 +1193,19 @@ cdef class PyObjectHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) - int64_t[:] labels + intp_t[:] labels int ret = 0 object val khiter_t k bint use_na_value if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.intp) use_na_value = na_value is not None for i in range(n): @@ -1240,7 +1240,7 @@ cdef class PyObjectHashTable(HashTable): labels[i] = idx if return_inverse: - return uniques.to_array(), np.asarray(labels) + return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def 
unique(self, ndarray[object] values, bint return_inverse=False): @@ -1259,7 +1259,7 @@ cdef class PyObjectHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) + labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques """ uniques = ObjectVector() @@ -1292,7 +1292,7 @@ cdef class PyObjectHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[intp_t] The labels from values to uniques """ uniques_vector = ObjectVector() diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 1b79d68c13570..511b373bc7e1f 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -10,6 +10,7 @@ from numpy cimport ( int16_t, int32_t, int64_t, + intp_t, ndarray, uint8_t, uint16_t, @@ -20,6 +21,7 @@ from numpy cimport ( cnp.import_array() from pandas._libs.algos import ( + ensure_int64, ensure_platform_int, groupsort_indexer, take_1d_int64_int64, @@ -27,7 +29,7 @@ from pandas._libs.algos import ( @cython.boundscheck(False) -def inner_join(const int64_t[:] left, const int64_t[:] right, +def inner_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 @@ -39,8 +41,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) + right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -78,7 +80,7 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, @cython.boundscheck(False) -def left_outer_join(const int64_t[:] left, const int64_t[:] right, +def left_outer_join(const intp_t[:] left, const 
intp_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 @@ -91,8 +93,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) + right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -151,7 +153,7 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, @cython.boundscheck(False) -def full_outer_join(const int64_t[:] left, const int64_t[:] right, +def full_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 @@ -163,8 +165,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) + right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) with nogil: # First pass, determine size of result set, do not use the NA group diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c01bf3931b27a..3c1279d62b126 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1973,7 +1973,7 @@ def _get_single_indexer(join_key, index, sort: bool = False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = libjoin.left_outer_join( - ensure_int64(left_key), ensure_int64(right_key), count, sort=sort + left_key, right_key, count, sort=sort ) return left_indexer, right_indexer @@ -2029,9 +2029,9 @@ def _factorize_keys( Returns ------- - array + np.ndarray[np.intp] Left 
(resp. right if called with `key='right'`) labels, as enumerated type. - array + np.ndarray[np.intp] Right (resp. left if called with `key='right'`) labels, as enumerated type. int Number of unique elements in union of left and right labels. @@ -2117,6 +2117,8 @@ def _factorize_keys( llab = rizer.factorize(lk) rlab = rizer.factorize(rk) + assert llab.dtype == np.intp, llab.dtype + assert rlab.dtype == np.intp, rlab.dtype count = rizer.get_count() @@ -2142,13 +2144,16 @@ def _factorize_keys( return llab, rlab, count -def _sort_labels(uniques: np.ndarray, left, right): +def _sort_labels( + uniques: np.ndarray, left: np.ndarray, right: np.ndarray +) -> tuple[np.ndarray, np.ndarray]: + # Both returned ndarrays are np.intp llength = len(left) labels = np.concatenate([left, right]) _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) - new_labels = ensure_int64(new_labels) + assert new_labels.dtype == np.intp new_left, new_right = new_labels[:llength], new_labels[llength:] return new_left, new_right diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py index 0bdb7b0e71e2d..f5426c71511bb 100644 --- a/pandas/tests/libs/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -46,8 +46,8 @@ def test_outer_join_indexer(self, dtype): tm.assert_numpy_array_equal(rindexer, exp) def test_cython_left_outer_join(self): - left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp) + right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp) max_group = 5 ls, rs = left_outer_join(left, right, max_group) @@ -70,8 +70,8 @@ def test_cython_left_outer_join(self): tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_right_outer_join(self): - left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 
3, 3], dtype=np.intp) + right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp) max_group = 5 rs, ls = left_outer_join(right, left, max_group) @@ -116,8 +116,8 @@ def test_cython_right_outer_join(self): tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_inner_join(self): - left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp) + right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.intp) max_group = 5 ls, rs = inner_join(left, right, max_group) @@ -256,10 +256,10 @@ def test_left_outer_join_bug(): 0, 2, ], - dtype=np.int64, + dtype=np.intp, ) - right = np.array([3, 1], dtype=np.int64) + right = np.array([3, 1], dtype=np.intp) max_groups = 4 lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False)