From b072b8269c971f97074ff5c090ab1c4fcf265ded Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Sat, 17 Sep 2022 08:52:47 -0400
Subject: [PATCH 1/5] PERF: Only check isna in certain cases in factorize

---
 pandas/core/algorithms.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 6a04cbf4b5846..3a7a6b9bc3ca5 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -568,17 +568,6 @@ def factorize_array(
 
     hash_klass, values = _get_hashtable_algo(values)
 
-    # factorize can now handle differentiating various types of null values.
-    # However, for backwards compatibility we only use the null for the
-    # provided dtype. This may be revisited in the future, see GH#48476.
-    null_mask = isna(values)
-    if null_mask.any():
-        na_value = na_value_for_dtype(values.dtype, compat=False)
-        # Don't modify (potentially user-provided) array
-        # error: No overload variant of "where" matches argument types "Any", "object",
-        # "ndarray[Any, Any]"
-        values = np.where(null_mask, na_value, values)  # type: ignore[call-overload]
-
     table = hash_klass(size_hint or len(values))
     uniques, codes = table.factorize(
         values,
@@ -812,6 +801,23 @@ def factorize(
             na_sentinel_arg = None
         else:
             na_sentinel_arg = na_sentinel
+
+        if not dropna and not sort:
+            # factorize can now handle differentiating various types of null values.
+            # However, for backwards compatibility we only use the null for the
+            # provided dtype. This may be revisited in the future, see GH#48476.
+            null_mask = isna(values)
+            if null_mask.any():
+                na_value = na_value_for_dtype(values.dtype, compat=False)
+                # Don't modify (potentially user-provided) array
+                # error: No overload variant of "where" matches argument types
+                # "Any", "object", "ndarray[Any, Any]"
+                values = np.where(  # type: ignore[call-overload]
+                    null_mask,
+                    na_value,
+                    values,
+                )
+
         codes, uniques = factorize_array(
             values,
             na_sentinel=na_sentinel_arg,

From 65f4c4b9a4d133547930ed5939071e9949404b24 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Sat, 17 Sep 2022 08:53:53 -0400
Subject: [PATCH 2/5] Remove type-ignore

---
 pandas/core/algorithms.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 3a7a6b9bc3ca5..00c908bc1f44c 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -810,9 +810,7 @@ def factorize(
             if null_mask.any():
                 na_value = na_value_for_dtype(values.dtype, compat=False)
                 # Don't modify (potentially user-provided) array
-                # error: No overload variant of "where" matches argument types
-                # "Any", "object", "ndarray[Any, Any]"
-                values = np.where(  # type: ignore[call-overload]
+                values = np.where(
                     null_mask,
                     na_value,
                     values,

From 3a609934cf97962edd12c1a22fe94b6da72f5148 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Sat, 17 Sep 2022 08:54:19 -0400
Subject: [PATCH 3/5] Reformat

---
 pandas/core/algorithms.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 00c908bc1f44c..93927f1076e31 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -810,11 +810,7 @@ def factorize(
             if null_mask.any():
                 na_value = na_value_for_dtype(values.dtype, compat=False)
                 # Don't modify (potentially user-provided) array
-                values = np.where(
-                    null_mask,
-                    na_value,
-                    values,
-                )
+                values = np.where(null_mask, na_value, values)
 
         codes, uniques = factorize_array(
             values,

From 135b1d35f358bd828e76f34fa2afb1a3143465a6 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Sun, 18 Sep 2022 10:58:37 -0400
Subject: [PATCH 4/5] Check for object dtype

---
 pandas/core/algorithms.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 93927f1076e31..a73a85efacc9d 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -802,8 +802,9 @@ def factorize(
         else:
             na_sentinel_arg = na_sentinel
 
-        if not dropna and not sort:
+        if not dropna and not sort and is_object_dtype(values):
             # factorize can now handle differentiating various types of null values.
+            # These can only occur when the array has object dtype.
             # However, for backwards compatibility we only use the null for the
             # provided dtype. This may be revisited in the future, see GH#48476.
             null_mask = isna(values)

From 58c63771d2acfccbbd3ca106e63cff13b5a99134 Mon Sep 17 00:00:00 2001
From: richard
Date: Mon, 19 Sep 2022 23:20:28 -0400
Subject: [PATCH 5/5] whatsnew note

---
 doc/source/whatsnew/v1.5.1.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
index f8069b5476d9e..5aaf9e629eb5a 100644
--- a/doc/source/whatsnew/v1.5.1.rst
+++ b/doc/source/whatsnew/v1.5.1.rst
@@ -14,7 +14,7 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
--
+- Fixed performance regression in :func:`factorize` when ``na_sentinel`` is not ``None`` and ``sort=False`` (:issue:`48620`)
 -
 
 .. ---------------------------------------------------------------------------