diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 1bbffaa7bb5d2..e402a4b7c0ccc 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -66,13 +66,18 @@ cdef class Factorizer:
         self.uniques = ObjectVector()
         self.count = 0
 
-    def get_count(self):
+    def get_count(self) -> int:
         return self.count
 
     def factorize(
         self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
-    ):
+    ) -> np.ndarray:
         """
+
+        Returns
+        -------
+        np.ndarray[np.intp]
+
         Examples
         --------
         Factorize values with nans replaced by na_sentinel
@@ -80,6 +85,9 @@ cdef class Factorizer:
         >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
         array([ 0,  1, 20])
         """
+        cdef:
+            ndarray[intp_t] labels
+
         if self.uniques.external_view_exists:
             uniques = ObjectVector()
             uniques.extend(self.uniques.to_array())
@@ -89,8 +97,6 @@ cdef class Factorizer:
         mask = (labels == na_sentinel)
         # sort on
         if sort:
-            if labels.dtype != np.intp:
-                labels = labels.astype(np.intp)
             sorter = self.uniques.to_array().argsort()
             reverse_indexer = np.empty(len(sorter), dtype=np.intp)
             reverse_indexer.put(sorter, np.arange(len(sorter)))
@@ -119,8 +125,12 @@ cdef class Int64Factorizer:
         return self.count
 
     def factorize(self, const int64_t[:] values, sort=False,
-                  na_sentinel=-1, na_value=None):
+                  na_sentinel=-1, na_value=None) -> np.ndarray:
         """
+        Returns
+        -------
+        np.ndarray[np.intp]
+
         Examples
         --------
         Factorize values with nans replaced by na_sentinel
@@ -128,6 +138,9 @@ cdef class Int64Factorizer:
         >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
         array([ 0,  1, 20])
         """
+        cdef:
+            ndarray[intp_t] labels
+
         if self.uniques.external_view_exists:
             uniques = Int64Vector()
             uniques.extend(self.uniques.to_array())
@@ -138,9 +151,6 @@ cdef class Int64Factorizer:
 
         # sort on
         if sort:
-            if labels.dtype != np.intp:
-                labels = labels.astype(np.intp)
-
             sorter = self.uniques.to_array().argsort()
             reverse_indexer = np.empty(len(sorter), dtype=np.intp)
             reverse_indexer.put(sorter, np.arange(len(sorter)))
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 0b6bb170cc531..6ace327ca3599 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -539,12 +539,12 @@ cdef class {{name}}HashTable(HashTable):
         -------
         uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse=True)
+        labels : ndarray[intp_t] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
-            int64_t[:] labels
+            intp_t[:] labels
             int ret = 0
             {{c_type}} val, na_value2
             khiter_t k
@@ -553,7 +553,7 @@ cdef class {{name}}HashTable(HashTable):
             uint8_t[:] mask_values
 
         if return_inverse:
-            labels = np.empty(n, dtype=np.int64)
+            labels = np.empty(n, dtype=np.intp)
         ud = uniques.data
         use_na_value = na_value is not None
         use_mask = mask is not None
@@ -614,7 +614,7 @@ cdef class {{name}}HashTable(HashTable):
                     labels[i] = idx
 
         if return_inverse:
-            return uniques.to_array(), np.asarray(labels)
+            return uniques.to_array(), labels.base  # .base -> underlying ndarray
         return uniques.to_array()
 
     def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
@@ -633,7 +633,7 @@ cdef class {{name}}HashTable(HashTable):
         -------
         uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse)
+        labels : ndarray[intp_t] (if return_inverse)
             The labels from values to uniques
         """
         uniques = {{name}}Vector()
@@ -668,7 +668,7 @@ cdef class {{name}}HashTable(HashTable):
         -------
         uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[intp_t]
             The labels from values to uniques
         """
         uniques_vector = {{name}}Vector()
@@ -918,12 +918,12 @@ cdef class StringHashTable(HashTable):
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse=True)
+        labels : ndarray[intp_t] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
-            int64_t[:] labels
+            intp_t[:] labels
             int64_t[:] uindexer
             int ret = 0
             object val
@@ -933,7 +933,7 @@ cdef class StringHashTable(HashTable):
             bint use_na_value
 
         if return_inverse:
-            labels = np.zeros(n, dtype=np.int64)
+            labels = np.zeros(n, dtype=np.intp)
         uindexer = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
@@ -972,13 +972,13 @@ cdef class StringHashTable(HashTable):
                     uindexer[count] = i
                     if return_inverse:
                         self.table.vals[k] = count
-                        labels[i] = <int64_t>count
+                        labels[i] = count
                     count += 1
                 elif return_inverse:
                     # k falls into a previous bucket
                     # only relevant in case we need to construct the inverse
                     idx = self.table.vals[k]
-                    labels[i] = <int64_t>idx
+                    labels[i] = idx
 
         free(vecs)
 
@@ -987,7 +987,7 @@ cdef class StringHashTable(HashTable):
             uniques.append(values[uindexer[i]])
 
         if return_inverse:
-            return uniques.to_array(), np.asarray(labels)
+            return uniques.to_array(), labels.base  # .base -> underlying ndarray
         return uniques.to_array()
 
     def unique(self, ndarray[object] values, bint return_inverse=False):
@@ -1193,19 +1193,19 @@ cdef class PyObjectHashTable(HashTable):
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse=True)
+        labels : ndarray[intp_t] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
-            int64_t[:] labels
+            intp_t[:] labels
             int ret = 0
             object val
             khiter_t k
             bint use_na_value
 
         if return_inverse:
-            labels = np.empty(n, dtype=np.int64)
+            labels = np.empty(n, dtype=np.intp)
         use_na_value = na_value is not None
 
         for i in range(n):
@@ -1240,7 +1240,7 @@ cdef class PyObjectHashTable(HashTable):
                 labels[i] = idx
 
         if return_inverse:
-            return uniques.to_array(), np.asarray(labels)
+            return uniques.to_array(), labels.base  # .base -> underlying ndarray
         return uniques.to_array()
 
     def unique(self, ndarray[object] values, bint return_inverse=False):
@@ -1259,7 +1259,7 @@ cdef class PyObjectHashTable(HashTable):
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse)
+        labels : ndarray[intp_t] (if return_inverse)
             The labels from values to uniques
         """
         uniques = ObjectVector()
@@ -1292,7 +1292,7 @@ cdef class PyObjectHashTable(HashTable):
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[intp_t]
             The labels from values to uniques
         """
         uniques_vector = ObjectVector()
diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx
index 1b79d68c13570..511b373bc7e1f 100644
--- a/pandas/_libs/join.pyx
+++ b/pandas/_libs/join.pyx
@@ -10,6 +10,7 @@ from numpy cimport (
     int16_t,
     int32_t,
     int64_t,
+    intp_t,
     ndarray,
     uint8_t,
     uint16_t,
@@ -20,6 +21,7 @@ from numpy cimport (
 cnp.import_array()
 
 from pandas._libs.algos import (
+    ensure_int64,
     ensure_platform_int,
     groupsort_indexer,
     take_1d_int64_int64,
@@ -27,7 +29,7 @@ from pandas._libs.algos import (
 
 
 @cython.boundscheck(False)
-def inner_join(const int64_t[:] left, const int64_t[:] right,
+def inner_join(const intp_t[:] left, const intp_t[:] right,
                Py_ssize_t max_groups):
     cdef:
         Py_ssize_t i, j, k, count = 0
@@ -39,8 +41,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right,
 
     # NA group in location 0
 
-    left_sorter, left_count = groupsort_indexer(left, max_groups)
-    right_sorter, right_count = groupsort_indexer(right, max_groups)
+    left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
+    right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
 
     with nogil:
         # First pass, determine size of result set, do not use the NA group
@@ -78,7 +80,7 @@ def inner_join(const int64_t[:] left, const int64_t[:] right,
 
 
 @cython.boundscheck(False)
-def left_outer_join(const int64_t[:] left, const int64_t[:] right,
+def left_outer_join(const intp_t[:] left, const intp_t[:] right,
                     Py_ssize_t max_groups, bint sort=True):
     cdef:
         Py_ssize_t i, j, k, count = 0
@@ -91,8 +93,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right,
 
     # NA group in location 0
 
-    left_sorter, left_count = groupsort_indexer(left, max_groups)
-    right_sorter, right_count = groupsort_indexer(right, max_groups)
+    left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
+    right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
 
     with nogil:
         # First pass, determine size of result set, do not use the NA group
@@ -151,7 +153,7 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right,
 
 
 @cython.boundscheck(False)
-def full_outer_join(const int64_t[:] left, const int64_t[:] right,
+def full_outer_join(const intp_t[:] left, const intp_t[:] right,
                     Py_ssize_t max_groups):
     cdef:
         Py_ssize_t i, j, k, count = 0
@@ -163,8 +165,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right,
 
     # NA group in location 0
 
-    left_sorter, left_count = groupsort_indexer(left, max_groups)
-    right_sorter, right_count = groupsort_indexer(right, max_groups)
+    left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
+    right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
 
     with nogil:
         # First pass, determine size of result set, do not use the NA group
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index c01bf3931b27a..3c1279d62b126 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1973,7 +1973,7 @@ def _get_single_indexer(join_key, index, sort: bool = False):
     left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)
 
     left_indexer, right_indexer = libjoin.left_outer_join(
-        ensure_int64(left_key), ensure_int64(right_key), count, sort=sort
+        left_key, right_key, count, sort=sort
     )
 
     return left_indexer, right_indexer
@@ -2029,9 +2029,9 @@ def _factorize_keys(
 
     Returns
     -------
-    array
+    np.ndarray[np.intp]
         Left (resp. right if called with `key='right'`) labels, as enumerated type.
-    array
+    np.ndarray[np.intp]
         Right (resp. left if called with `key='right'`) labels, as enumerated type.
     int
         Number of unique elements in union of left and right labels.
@@ -2117,6 +2117,8 @@ def _factorize_keys(
 
     llab = rizer.factorize(lk)
     rlab = rizer.factorize(rk)
+    assert llab.dtype == np.intp, llab.dtype
+    assert rlab.dtype == np.intp, rlab.dtype
 
     count = rizer.get_count()
 
@@ -2142,13 +2144,16 @@ def _factorize_keys(
     return llab, rlab, count
 
 
-def _sort_labels(uniques: np.ndarray, left, right):
+def _sort_labels(
+    uniques: np.ndarray, left: np.ndarray, right: np.ndarray
+) -> tuple[np.ndarray, np.ndarray]:
+    # Both returned ndarrays have dtype np.intp
 
     llength = len(left)
     labels = np.concatenate([left, right])
 
     _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
-    new_labels = ensure_int64(new_labels)
+    assert new_labels.dtype == np.intp
     new_left, new_right = new_labels[:llength], new_labels[llength:]
 
     return new_left, new_right
diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py
index 0bdb7b0e71e2d..f5426c71511bb 100644
--- a/pandas/tests/libs/test_join.py
+++ b/pandas/tests/libs/test_join.py
@@ -46,8 +46,8 @@ def test_outer_join_indexer(self, dtype):
         tm.assert_numpy_array_equal(rindexer, exp)
 
     def test_cython_left_outer_join(self):
-        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
-        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
+        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
+        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
         max_group = 5
 
         ls, rs = left_outer_join(left, right, max_group)
@@ -70,8 +70,8 @@ def test_cython_left_outer_join(self):
         tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
 
     def test_cython_right_outer_join(self):
-        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
-        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
+        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
+        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
         max_group = 5
 
         rs, ls = left_outer_join(right, left, max_group)
@@ -116,8 +116,8 @@ def test_cython_right_outer_join(self):
         tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
 
     def test_cython_inner_join(self):
-        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
-        right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
+        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
+        right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.intp)
         max_group = 5
 
         ls, rs = inner_join(left, right, max_group)
@@ -256,10 +256,10 @@ def test_left_outer_join_bug():
             0,
             2,
         ],
-        dtype=np.int64,
+        dtype=np.intp,
     )
 
-    right = np.array([3, 1], dtype=np.int64)
+    right = np.array([3, 1], dtype=np.intp)
     max_groups = 4
 
     lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False)