
Commit 0aa32e4

codesorcery authored and HyukjinKwon committed
[SPARK-48710][PYTHON] Use NumPy 2.0 compatible types
### What changes were proposed in this pull request?

* Replace NumPy types removed in NumPy 2.0 with their equivalent counterparts
* Make tests compatible with the new `__repr__` of numerical scalars

### Why are the changes needed?

PySpark references some code which was removed with NumPy 2.0:

* `np.NaN` was removed, should be replaced with `np.nan`
* `np.string_` was removed, [is an alias for](https://github.com/numpy/numpy/blob/v1.26.5/numpy/__init__.pyi#L3134) `np.bytes_`
* `np.float_` was removed, [is defined the same as](https://github.com/numpy/numpy/blob/v1.26.5/numpy/__init__.pyi#L3042-3043) `np.double`
* `np.unicode_` was removed, [is an alias for](https://github.com/numpy/numpy/blob/v1.26.5/numpy/__init__.pyi#L3148) `np.str_`

NumPy 2.0 changed the `__repr__` of numerical scalars to contain type information (e.g. `np.int32(3)` instead of `3`). The old behavior can be enabled by setting `numpy.printoptions(legacy="1.25")` (or the older `1.21` and `1.13` legacy modes). There are multiple tests and doctests that rely on the old behavior.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Tests for the modules `pyspark-connect`, `pyspark-core`, `pyspark-errors`, `pyspark-mllib`, `pyspark-pandas`, `pyspark-sql`, `pyspark-resource`, and `pyspark-testing` were executed in a local venv with `numpy==2.0.0` installed.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #47083 from codesorcery/SPARK-48710.

Authored-by: Patrick Marx <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
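To illustrate the scalar `__repr__` change and the opt-out used by the doctest helpers in this change, a minimal sketch (not part of the diff; assumes NumPy 2.x is installed):

```python
import numpy as np

# NumPy 2.0 includes the type name in scalar reprs.
print(repr(np.int32(3)))   # NumPy 2.x: "np.int32(3)"; NumPy 1.x: "3"
print(str(np.int32(3)))    # "3" on both major versions

# Restore the pre-2.0 formatting, as the `_test()` helpers in this PR do.
np.set_printoptions(legacy="1.25")
print(repr(np.int32(3)))   # "3" again
```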
1 parent fea930a commit 0aa32e4

20 files changed (+79, -41 lines changed)

python/pyspark/core/rdd.py

Lines changed: 12 additions & 0 deletions
@@ -5370,6 +5370,18 @@ def _test() -> None:
     import tempfile
     from pyspark.core.context import SparkContext
 
+    try:
+        # Numpy 2.0+ changed its string format,
+        # adding type information to numeric scalars.
+        import numpy as np
+        from pandas.util.version import Version
+
+        if Version(np.__version__) >= Version("2"):
+            # `legacy="1.25"` only available in `numpy>=2`
+            np.set_printoptions(legacy="1.25")  # type: ignore[arg-type]
+    except TypeError:
+        pass
+
     tmp_dir = tempfile.TemporaryDirectory()
     globs = globals().copy()
     # The small batch size here ensures that we see multiple batches,

python/pyspark/ml/param/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -115,7 +115,7 @@ def _can_convert_to_list(value: Any) -> bool:
     @staticmethod
     def _can_convert_to_string(value: Any) -> bool:
         vtype = type(value)
-        return isinstance(value, str) or vtype in [np.unicode_, np.string_, np.str_]
+        return isinstance(value, str) or vtype in [np.bytes_, np.str_]
 
     @staticmethod
     def identity(value: "T") -> "T":
@@ -230,7 +230,7 @@ def toString(value: Any) -> str:
         """
         if isinstance(value, str):
             return value
-        elif type(value) in [np.string_, np.str_, np.unicode_]:
+        elif type(value) in [np.bytes_, np.str_]:
             return str(value)
         else:
             raise TypeError("Could not convert %s to string type" % type(value))
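For context on the alias swap in these converters, a minimal sketch (runs on NumPy 1.x, where the removed names still exist; the sample value is illustrative):

```python
import numpy as np

# On NumPy 1.x the names removed in 2.0 are plain aliases of the surviving ones,
# so the replacement accepts exactly the same scalar types.
assert np.string_ is np.bytes_   # removed in NumPy 2.0
assert np.unicode_ is np.str_    # removed in NumPy 2.0
assert np.float_ is np.double    # removed in NumPy 2.0

# The updated membership check from this file behaves the same for both spellings.
value = np.str_("driver")
assert type(value) in [np.bytes_, np.str_]
```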

python/pyspark/pandas/base.py

Lines changed: 2 additions & 2 deletions
@@ -903,7 +903,7 @@ def isnull(self: IndexOpsLike) -> IndexOpsLike:
 
         Examples
         --------
-        >>> ser = ps.Series([5, 6, np.NaN])
+        >>> ser = ps.Series([5, 6, np.nan])
         >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
         0    False
         1    False
@@ -939,7 +939,7 @@ def notnull(self: IndexOpsLike) -> IndexOpsLike:
         --------
         Show which entries in a Series are not NA.
 
-        >>> ser = ps.Series([5, 6, np.NaN])
+        >>> ser = ps.Series([5, 6, np.nan])
         >>> ser
         0    5.0
         1    6.0

python/pyspark/pandas/frame.py

Lines changed: 1 addition & 1 deletion
@@ -10064,7 +10064,7 @@ def reindex(
             number (0, 1).
         copy : bool, default True
             Return a new object, even if the passed indexes are the same.
-        fill_value : scalar, default np.NaN
+        fill_value : scalar, default np.nan
             Value to use for missing values. Defaults to NaN, but can be any
             "compatible" value.

python/pyspark/pandas/indexes/base.py

Lines changed: 7 additions & 0 deletions
@@ -2645,9 +2645,16 @@ def _test() -> None:
     import sys
     from pyspark.sql import SparkSession
     import pyspark.pandas.indexes.base
+    from pandas.util.version import Version
 
     os.chdir(os.environ["SPARK_HOME"])
 
+    if Version(np.__version__) >= Version("2"):
+        # Numpy 2.0+ changed its string format,
+        # adding type information to numeric scalars.
+        # `legacy="1.25"` only available in `numpy>=2`
+        np.set_printoptions(legacy="1.25")  # type: ignore[arg-type]
+
     globs = pyspark.pandas.indexes.base.__dict__.copy()
     globs["ps"] = pyspark.pandas
     spark = (

python/pyspark/pandas/indexing.py

Lines changed: 7 additions & 0 deletions
@@ -1833,9 +1833,16 @@ def _test() -> None:
     import sys
     from pyspark.sql import SparkSession
     import pyspark.pandas.indexing
+    from pandas.util.version import Version
 
     os.chdir(os.environ["SPARK_HOME"])
 
+    if Version(np.__version__) >= Version("2"):
+        # Numpy 2.0+ changed its string format,
+        # adding type information to numeric scalars.
+        # `legacy="1.25"` only available in `numpy>=2`
+        np.set_printoptions(legacy="1.25")  # type: ignore[arg-type]
+
     globs = pyspark.pandas.indexing.__dict__.copy()
     globs["ps"] = pyspark.pandas
     spark = (

python/pyspark/pandas/namespace.py

Lines changed: 9 additions & 2 deletions
@@ -2812,7 +2812,7 @@ def notna(obj):
         --------
         Show which entries in a DataFrame are not NA.
 
-        >>> df = ps.DataFrame({'age': [5, 6, np.NaN],
+        >>> df = ps.DataFrame({'age': [5, 6, np.nan],
         ...                    'born': [pd.NaT, pd.Timestamp('1939-05-27'),
         ...                             pd.Timestamp('1940-04-25')],
         ...                    'name': ['Alfred', 'Batman', ''],
@@ -2831,7 +2831,7 @@ def notna(obj):
 
         Show which entries in a Series are not NA.
 
-        >>> ser = ps.Series([5, 6, np.NaN])
+        >>> ser = ps.Series([5, 6, np.nan])
         >>> ser
         0    5.0
         1    6.0
@@ -3731,9 +3731,16 @@ def _test() -> None:
     import uuid
     from pyspark.sql import SparkSession
     import pyspark.pandas.namespace
+    from pandas.util.version import Version
 
     os.chdir(os.environ["SPARK_HOME"])
 
+    if Version(np.__version__) >= Version("2"):
+        # Numpy 2.0+ changed its string format,
+        # adding type information to numeric scalars.
+        # `legacy="1.25"` only available in `numpy>=2`
+        np.set_printoptions(legacy="1.25")  # type: ignore[arg-type]
+
     globs = pyspark.pandas.namespace.__dict__.copy()
     globs["ps"] = pyspark.pandas
     globs["sf"] = F

python/pyspark/pandas/series.py

Lines changed: 1 addition & 1 deletion
@@ -1893,7 +1893,7 @@ def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None)
         index: array-like, optional
             New labels / index to conform to, should be specified using keywords.
             Preferably an Index object to avoid duplicating data
-        fill_value : scalar, default np.NaN
+        fill_value : scalar, default np.nan
            Value to use for missing values. Defaults to NaN, but can be any
            "compatible" value.

python/pyspark/pandas/strings.py

Lines changed: 4 additions & 4 deletions
@@ -862,7 +862,7 @@ def contains(
         --------
         Returning a Series of booleans using only a literal pattern.
 
-        >>> s1 = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
+        >>> s1 = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
         >>> s1.str.contains('og', regex=False)
         0    False
         1     True
@@ -965,7 +965,7 @@ def count(self, pat: str, flags: int = 0) -> "ps.Series":
 
         Examples
         --------
-        >>> s = ps.Series(['A', 'B', 'Aaba', 'Baca', np.NaN, 'CABA', 'cat'])
+        >>> s = ps.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
         >>> s.str.count('a')
         0    0.0
         1    0.0
@@ -1327,7 +1327,7 @@ def pandas_ljust(s) -> ps.Series[str]:  # type: ignore[no-untyped-def]
 
         return self._data.pandas_on_spark.transform_batch(pandas_ljust)
 
-    def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.NaN) -> "ps.Series":
+    def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.nan) -> "ps.Series":
         """
         Determine if each string matches a regular expression.
 
@@ -1353,7 +1353,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.nan) -
 
         Examples
         --------
-        >>> s = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
+        >>> s = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
         >>> s.str.match('dog')
         0    False
         1     True

python/pyspark/pandas/tests/indexes/test_astype.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ def test_astype(self):
         self.assert_eq(psidx.astype(bool), pidx.astype(bool))
         self.assert_eq(psidx.astype("bool"), pidx.astype("bool"))
         self.assert_eq(psidx.astype("?"), pidx.astype("?"))
-        self.assert_eq(psidx.astype(np.unicode_), pidx.astype(np.unicode_))
+        self.assert_eq(psidx.astype(np.str_), pidx.astype(np.str_))
         self.assert_eq(psidx.astype("str"), pidx.astype("str"))
         self.assert_eq(psidx.astype("U"), pidx.astype("U"))

python/pyspark/pandas/tests/series/test_arg_ops.py

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ def test_argmin_argmax(self):
         self.assert_eq(pser.argmax(), psser.argmax())
         self.assert_eq(pser.argmax(skipna=False), psser.argmax(skipna=False))
 
-        pser2 = pd.Series([np.NaN, 1.0, 2.0, np.NaN])
+        pser2 = pd.Series([np.nan, 1.0, 2.0, np.nan])
         psser2 = ps.from_pandas(pser2)
         self.assert_eq(pser2.argmin(), psser2.argmin())
         self.assert_eq(pser2.argmax(), psser2.argmax())

python/pyspark/pandas/tests/series/test_as_of.py

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ def test_asof(self):
         self.assert_eq(psser.asof(20), pser.asof(20))
         self.assert_eq(psser.asof([5, 20]).sort_index(), pser.asof([5, 20]).sort_index())
         self.assert_eq(psser.asof(100), pser.asof(100))
-        self.assert_eq(repr(psser.asof(-100)), repr(pser.asof(-100)))
+        self.assert_eq(str(psser.asof(-100)), str(pser.asof(-100)))
         self.assert_eq(psser.asof([-100, 100]).sort_index(), pser.asof([-100, 100]).sort_index())
 
         # where cannot be an Index, Series or a DataFrame
@@ -55,7 +55,7 @@ def test_asof(self):
 
         self.assert_eq(psser.asof("2014-01-01"), pser.asof("2014-01-01"))
         self.assert_eq(psser.asof("2014-01-02"), pser.asof("2014-01-02"))
-        self.assert_eq(repr(psser.asof("1999-01-02")), repr(pser.asof("1999-01-02")))
+        self.assert_eq(str(psser.asof("1999-01-02")), str(pser.asof("1999-01-02")))
 
         # SPARK-37482: Skip check monotonic increasing for Series.asof with 'compute.eager_check'
         pser = pd.Series([1, 2, np.nan, 4], index=[10, 30, 20, 40])
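Switching these assertions from `repr` to `str` keeps them version-agnostic: NumPy 2.x adds the type name to a scalar's `repr` but leaves `str` unchanged. A minimal sketch of the difference (assumes NumPy 2.x; the scalar is illustrative):

```python
import numpy as np

x = np.float64("nan")   # illustrative NaN-like scalar, e.g. a missing asof() result
print(repr(x))          # NumPy 2.x: "np.float64(nan)"; NumPy 1.x: "nan"
print(str(x))           # "nan" on both, matching str(float("nan"))
```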

python/pyspark/pandas/tests/series/test_as_type.py

Lines changed: 1 addition & 1 deletion
@@ -126,7 +126,7 @@ def _test_numeric_astype(self, pser):
         self.assert_eq(psser.astype(bool), pser.astype(bool))
         self.assert_eq(psser.astype("bool"), pser.astype("bool"))
         self.assert_eq(psser.astype("?"), pser.astype("?"))
-        self.assert_eq(psser.astype(np.unicode_), pser.astype(np.unicode_))
+        self.assert_eq(psser.astype(np.str_), pser.astype(np.str_))
         self.assert_eq(psser.astype("str"), pser.astype("str"))
         self.assert_eq(psser.astype("U"), pser.astype("U"))

python/pyspark/pandas/tests/series/test_string_ops_adv.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def pser(self):
                 "\nleading-whitespace",
                 "trailing-Whitespace \t",
                 None,
-                np.NaN,
+                np.nan,
             ]
         )

python/pyspark/pandas/tests/series/test_string_ops_basic.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def pser(self):
                 "\nleading-whitespace",
                 "trailing-Whitespace \t",
                 None,
-                np.NaN,
+                np.nan,
             ]
         )

python/pyspark/pandas/tests/test_typedef.py

Lines changed: 10 additions & 11 deletions
@@ -93,12 +93,12 @@ def func() -> pd.Series[float]:
         self.assertEqual(inferred.dtype, np.float64)
         self.assertEqual(inferred.spark_type, DoubleType())
 
-        def func() -> "pd.DataFrame[np.float_, str]":
+        def func() -> "pd.DataFrame[np.float64, str]":
             pass
 
         expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
         inferred = infer_return_type(func)
-        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
+        self.assertEqual(inferred.dtypes, [np.float64, np.str_])
         self.assertEqual(inferred.spark_type, expected)
 
         def func() -> "pandas.DataFrame[float]":
@@ -121,10 +121,10 @@ def func() -> pd.DataFrame[np.float64, str]:
 
         expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
         inferred = infer_return_type(func)
-        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
+        self.assertEqual(inferred.dtypes, [np.float64, np.str_])
         self.assertEqual(inferred.spark_type, expected)
 
-        def func() -> pd.DataFrame[np.float_]:
+        def func() -> pd.DataFrame[np.float64]:
             pass
 
         expected = StructType([StructField("c0", DoubleType())])
@@ -167,12 +167,12 @@ def test_if_pandas_implements_class_getitem(self):
         assert not ps._series_has_class_getitem
 
     def test_infer_schema_with_names_pandas_instances(self):
-        def func() -> 'pd.DataFrame["a" : np.float_, "b":str]':  # noqa: F405
+        def func() -> 'pd.DataFrame["a" : np.float64, "b":str]':  # noqa: F405
             pass
 
         expected = StructType([StructField("a", DoubleType()), StructField("b", StringType())])
         inferred = infer_return_type(func)
-        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
+        self.assertEqual(inferred.dtypes, [np.float64, np.str_])
         self.assertEqual(inferred.spark_type, expected)
 
         def func() -> "pd.DataFrame['a': float, 'b': int]":  # noqa: F405
@@ -217,7 +217,7 @@ def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
 
     def test_infer_schema_with_names_pandas_instances_negative(self):
         def try_infer_return_type():
-            def f() -> 'pd.DataFrame["a" : np.float_ : 1, "b":str:2]':  # noqa: F405
+            def f() -> 'pd.DataFrame["a" : np.float64 : 1, "b":str:2]':  # noqa: F405
                 pass
 
             infer_return_type(f)
@@ -283,7 +283,7 @@ def f() -> ps.DataFrame[A]:
         self.assertRaisesRegex(TypeError, "not understood", try_infer_return_type)
 
         def try_infer_return_type():
-            def f() -> 'ps.DataFrame["a" : np.float_ : 1, "b":str:2]':  # noqa: F405
+            def f() -> 'ps.DataFrame["a" : np.float64 : 1, "b":str:2]':  # noqa: F405
                 pass
 
             infer_return_type(f)
@@ -314,7 +314,6 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
             # binary
             np.character: (np.character, BinaryType()),
             np.bytes_: (np.bytes_, BinaryType()),
-            np.string_: (np.bytes_, BinaryType()),
             bytes: (np.bytes_, BinaryType()),
             # integer
             np.int8: (np.int8, ByteType()),
@@ -328,8 +327,8 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
             np.float64: (np.float64, DoubleType()),
             float: (np.float64, DoubleType()),
             # string
-            np.unicode_: (np.unicode_, StringType()),
-            str: (np.unicode_, StringType()),
+            np.str_: (np.str_, StringType()),
+            str: (np.str_, StringType()),
             # bool
             bool: (np.bool_, BooleanType()),
             # datetime

python/pyspark/pandas/typedef/typehints.py

Lines changed: 3 additions & 3 deletions
@@ -176,7 +176,7 @@ def as_spark_type(
             return None
         return types.ArrayType(element_type)
     # BinaryType
-    elif tpe in (bytes, np.character, np.bytes_, np.string_):
+    elif tpe in (bytes, np.character, np.bytes_):
         return types.BinaryType()
     # BooleanType
     elif tpe in (bool, np.bool_, "bool", "?"):
@@ -190,7 +190,7 @@ def as_spark_type(
     elif tpe in (decimal.Decimal,):
         # TODO: considering the precision & scale for decimal type.
         return types.DecimalType(38, 18)
-    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
+    elif tpe in (float, np.double, np.float64, "float", "float64", "double"):
         return types.DoubleType()
     elif tpe in (np.float32, "float32", "f"):
         return types.FloatType()
@@ -201,7 +201,7 @@ def as_spark_type(
     elif tpe in (np.int16, "int16", "short"):
         return types.ShortType()
     # StringType
-    elif tpe in (str, np.unicode_, "str", "U"):
+    elif tpe in (str, np.str_, "str", "U"):
         return types.StringType()
     # TimestampType or TimestampNTZType if timezone is not specified.
     elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M", pd.Timestamp):
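A quick sketch of why the removed names had to go from these type tuples (assumes NumPy 2.x, where accessing them raises):

```python
import numpy as np

# Under NumPy 2.x the removed aliases are gone, so keeping `np.float_` or
# `np.unicode_` in the tuples above would fail as soon as they are evaluated.
try:
    np.float_
except AttributeError as e:
    print(e)   # the message points to np.float64 as the replacement

# The surviving names describe the same concrete dtype.
assert np.dtype(np.double) == np.dtype(np.float64)
```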

python/pyspark/sql/tests/test_arrow_python_udf.py

Lines changed: 9 additions & 5 deletions
@@ -60,7 +60,7 @@ def test_complex_input_types(self):
             .first()
         )
 
-        self.assertEqual(row[0], "[1, 2, 3]")
+        self.assertIn(row[0], ["[1, 2, 3]", "[np.int32(1), np.int32(2), np.int32(3)]"])
         self.assertEqual(row[1], "{'a': 'b'}")
         self.assertEqual(row[2], "Row(col1=1, col2=2)")
 
@@ -119,9 +119,10 @@ def test_register(self):
         str_repr_func = self.spark.udf.register("str_repr", udf(lambda x: str(x), useArrow=True))
 
         # To verify that Arrow optimization is on
-        self.assertEqual(
+        self.assertIn(
             df.selectExpr("str_repr(array) AS str_id").first()[0],
-            "[1, 2, 3]",  # The input is a NumPy array when the Arrow optimization is on
+            ["[1, 2, 3]", "[np.int32(1), np.int32(2), np.int32(3)]"],
+            # The input is a NumPy array when the Arrow optimization is on
         )
 
         # To verify that a UserDefinedFunction is returned
@@ -132,11 +133,14 @@ def test_register(self):
 
     def test_nested_array_input(self):
         df = self.spark.range(1).selectExpr("array(array(1, 2), array(3, 4)) as nested_array")
-        self.assertEqual(
+        self.assertIn(
             df.select(
                 udf(lambda x: str(x), returnType="string", useArrow=True)("nested_array")
             ).first()[0],
-            "[[1, 2], [3, 4]]",
+            [
+                "[[1, 2], [3, 4]]",
+                "[[np.int32(1), np.int32(2)], [np.int32(3), np.int32(4)]]",
+            ],
        )
 
     def test_type_coercion_string_to_numeric(self):
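These tests accept both renderings because the UDF stringifies a Python list of NumPy scalars, and `str()` of a list falls back to each element's `repr`, which changed in NumPy 2.0. A minimal sketch of the two outputs (the contents are illustrative):

```python
import numpy as np

values = [np.int32(1), np.int32(2), np.int32(3)]
print(str(values))
# NumPy 1.x: [1, 2, 3]
# NumPy 2.x: [np.int32(1), np.int32(2), np.int32(3)]
```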

python/pyspark/sql/tests/test_udf.py

Lines changed: 3 additions & 1 deletion
@@ -904,7 +904,9 @@ def test_nested_array(self):
         df = self.spark.range(1).selectExpr("array(array(1, 2), array(3, 4)) as nested_array")
         # Input
         row = df.select(udf(lambda x: str(x))("nested_array")).first()
-        self.assertEqual(row[0], "[[1, 2], [3, 4]]")
+        self.assertIn(
+            row[0], ["[[1, 2], [3, 4]]", "[[np.int32(1), np.int32(2)], [np.int32(3), np.int32(4)]]"]
+        )
         # Output
 
         @udf(returnType=df.dtypes[0][1])
