
Commit 0aa32e4

codesorcery authored and HyukjinKwon committed
[SPARK-48710][PYTHON] Use NumPy 2.0 compatible types
### What changes were proposed in this pull request?

* Replace NumPy types removed in NumPy 2.0 with their equivalent counterparts
* Make tests compatible with the new `__repr__` of numerical scalars

### Why are the changes needed?

PySpark references some code which was removed with NumPy 2.0:

* `np.NaN` was removed, should be replaced with `np.nan`
* `np.string_` was removed, [is an alias for](https://github.com/numpy/numpy/blob/v1.26.5/numpy/__init__.pyi#L3134) `np.bytes_`
* `np.float_` was removed, [is defined the same as](https://github.com/numpy/numpy/blob/v1.26.5/numpy/__init__.pyi#L3042-3043) `np.double`
* `np.unicode_` was removed, [is an alias for](https://github.com/numpy/numpy/blob/v1.26.5/numpy/__init__.pyi#L3148) `np.str_`

NumPy 2.0 changed the `__repr__` of numerical scalars to contain type information (e.g. `np.int32(3)` instead of `3`). The old behavior can be enabled by setting `numpy.printoptions(legacy="1.25")` (or the older `1.21` and `1.13` legacy modes). There are multiple tests and doctests that rely on the old behavior.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Tests for the modules `pyspark-connect`, `pyspark-core`, `pyspark-errors`, `pyspark-mllib`, `pyspark-pandas`, `pyspark-sql`, `pyspark-resource`, and `pyspark-testing` were executed in a local venv with `numpy==2.0.0` installed.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #47083 from codesorcery/SPARK-48710.

Authored-by: Patrick Marx <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
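To illustrate the scalar `__repr__` change and the opt-out used by the doctest helpers in this change, a minimal sketch (not part of the diff; assumes NumPy 2.x is installed):

```python
import numpy as np

# NumPy 2.0 includes the type name in scalar reprs.
print(repr(np.int32(3)))   # NumPy 2.x: "np.int32(3)"; NumPy 1.x: "3"
print(str(np.int32(3)))    # "3" on both major versions

# Restore the pre-2.0 formatting, as the `_test()` helpers in this PR do.
np.set_printoptions(legacy="1.25")
print(repr(np.int32(3)))   # "3" again
```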
1 parent fea930a commit 0aa32e4

20 files changed (+79, -41 lines changed)

python/pyspark/core/rdd.py

Lines changed: 12 additions & 0 deletions
@@ -5370,6 +5370,18 @@ def _test() -> None:
     import tempfile
     from pyspark.core.context import SparkContext
 
+    try:
+        # Numpy 2.0+ changed its string format,
+        # adding type information to numeric scalars.
+        import numpy as np
+        from pandas.util.version import Version
+
+        if Version(np.__version__) >= Version("2"):
+            # `legacy="1.25"` only available in `numpy>=2`
+            np.set_printoptions(legacy="1.25")  # type: ignore[arg-type]
+    except TypeError:
+        pass
+
     tmp_dir = tempfile.TemporaryDirectory()
     globs = globals().copy()
     # The small batch size here ensures that we see multiple batches,

python/pyspark/ml/param/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -115,7 +115,7 @@ def _can_convert_to_list(value: Any) -> bool:
     @staticmethod
     def _can_convert_to_string(value: Any) -> bool:
         vtype = type(value)
-        return isinstance(value, str) or vtype in [np.unicode_, np.string_, np.str_]
+        return isinstance(value, str) or vtype in [np.bytes_, np.str_]
 
     @staticmethod
     def identity(value: "T") -> "T":
@@ -230,7 +230,7 @@ def toString(value: Any) -> str:
         """
         if isinstance(value, str):
             return value
-        elif type(value) in [np.string_, np.str_, np.unicode_]:
+        elif type(value) in [np.bytes_, np.str_]:
             return str(value)
         else:
             raise TypeError("Could not convert %s to string type" % type(value))
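For context on the alias swap in these converters, a minimal sketch (runs on NumPy 1.x, where the removed names still exist; the sample value is illustrative):

```python
import numpy as np

# On NumPy 1.x the names removed in 2.0 are plain aliases of the surviving ones,
# so the replacement accepts exactly the same scalar types.
assert np.string_ is np.bytes_   # removed in NumPy 2.0
assert np.unicode_ is np.str_    # removed in NumPy 2.0
assert np.float_ is np.double    # removed in NumPy 2.0

# The updated membership check from this file behaves the same for both spellings.
value = np.str_("driver")
assert type(value) in [np.bytes_, np.str_]
```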

python/pyspark/pandas/base.py

Lines changed: 2 additions & 2 deletions
@@ -903,7 +903,7 @@ def isnull(self: IndexOpsLike) -> IndexOpsLike:
 
         Examples
         --------
-        >>> ser = ps.Series([5, 6, np.NaN])
+        >>> ser = ps.Series([5, 6, np.nan])
         >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
         0    False
         1    False
@@ -939,7 +939,7 @@ def notnull(self: IndexOpsLike) -> IndexOpsLike:
         --------
         Show which entries in a Series are not NA.
 
-        >>> ser = ps.Series([5, 6, np.NaN])
+        >>> ser = ps.Series([5, 6, np.nan])
         >>> ser
         0    5.0
         1    6.0

python/pyspark/pandas/frame.py

Lines changed: 1 addition & 1 deletion
@@ -10064,7 +10064,7 @@ def reindex(
             number (0, 1).
         copy : bool, default True
             Return a new object, even if the passed indexes are the same.
-        fill_value : scalar, default np.NaN
+        fill_value : scalar, default np.nan
             Value to use for missing values. Defaults to NaN, but can be any
             "compatible" value.

python/pyspark/pandas/indexes/base.py

Lines changed: 7 additions & 0 deletions
@@ -2645,9 +2645,16 @@ def _test() -> None:
     import sys
     from pyspark.sql import SparkSession
     import pyspark.pandas.indexes.base
+    from pandas.util.version import Version
 
     os.chdir(os.environ["SPARK_HOME"])
 
+    if Version(np.__version__) >= Version("2"):
+        # Numpy 2.0+ changed its string format,
+        # adding type information to numeric scalars.
+        # `legacy="1.25"` only available in `numpy>=2`
+        np.set_printoptions(legacy="1.25")  # type: ignore[arg-type]
+
     globs = pyspark.pandas.indexes.base.__dict__.copy()
     globs["ps"] = pyspark.pandas
     spark = (

python/pyspark/pandas/indexing.py

Lines changed: 7 additions & 0 deletions
@@ -1833,9 +1833,16 @@ def _test() -> None:
     import sys
     from pyspark.sql import SparkSession
     import pyspark.pandas.indexing
+    from pandas.util.version import Version
 
     os.chdir(os.environ["SPARK_HOME"])
 
+    if Version(np.__version__) >= Version("2"):
+        # Numpy 2.0+ changed its string format,
+        # adding type information to numeric scalars.
+        # `legacy="1.25"` only available in `numpy>=2`
+        np.set_printoptions(legacy="1.25")  # type: ignore[arg-type]
+
     globs = pyspark.pandas.indexing.__dict__.copy()
     globs["ps"] = pyspark.pandas
     spark = (

python/pyspark/pandas/namespace.py

Lines changed: 9 additions & 2 deletions
@@ -2812,7 +2812,7 @@ def notna(obj):
         --------
         Show which entries in a DataFrame are not NA.
 
-        >>> df = ps.DataFrame({'age': [5, 6, np.NaN],
+        >>> df = ps.DataFrame({'age': [5, 6, np.nan],
         ...                    'born': [pd.NaT, pd.Timestamp('1939-05-27'),
         ...                             pd.Timestamp('1940-04-25')],
         ...                    'name': ['Alfred', 'Batman', ''],
@@ -2831,7 +2831,7 @@ def notna(obj):
 
         Show which entries in a Series are not NA.
 
-        >>> ser = ps.Series([5, 6, np.NaN])
+        >>> ser = ps.Series([5, 6, np.nan])
         >>> ser
         0    5.0
         1    6.0
@@ -3731,9 +3731,16 @@ def _test() -> None:
     import uuid
     from pyspark.sql import SparkSession
     import pyspark.pandas.namespace
+    from pandas.util.version import Version
 
     os.chdir(os.environ["SPARK_HOME"])
 
+    if Version(np.__version__) >= Version("2"):
+        # Numpy 2.0+ changed its string format,
+        # adding type information to numeric scalars.
+        # `legacy="1.25"` only available in `numpy>=2`
+        np.set_printoptions(legacy="1.25")  # type: ignore[arg-type]
+
     globs = pyspark.pandas.namespace.__dict__.copy()
     globs["ps"] = pyspark.pandas
     globs["sf"] = F

python/pyspark/pandas/series.py

Lines changed: 1 addition & 1 deletion
@@ -1893,7 +1893,7 @@ def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None)
         index: array-like, optional
             New labels / index to conform to, should be specified using keywords.
             Preferably an Index object to avoid duplicating data
-        fill_value : scalar, default np.NaN
+        fill_value : scalar, default np.nan
            Value to use for missing values. Defaults to NaN, but can be any
            "compatible" value.

python/pyspark/pandas/strings.py

Lines changed: 4 additions & 4 deletions
@@ -862,7 +862,7 @@ def contains(
         --------
         Returning a Series of booleans using only a literal pattern.
 
-        >>> s1 = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
+        >>> s1 = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
         >>> s1.str.contains('og', regex=False)
         0    False
         1     True
@@ -965,7 +965,7 @@ def count(self, pat: str, flags: int = 0) -> "ps.Series":
 
         Examples
         --------
-        >>> s = ps.Series(['A', 'B', 'Aaba', 'Baca', np.NaN, 'CABA', 'cat'])
+        >>> s = ps.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
         >>> s.str.count('a')
         0    0.0
         1    0.0
@@ -1327,7 +1327,7 @@ def pandas_ljust(s) -> ps.Series[str]:  # type: ignore[no-untyped-def]
 
         return self._data.pandas_on_spark.transform_batch(pandas_ljust)
 
-    def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.NaN) -> "ps.Series":
+    def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.nan) -> "ps.Series":
         """
         Determine if each string matches a regular expression.
 
@@ -1353,7 +1353,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.nan) -
 
         Examples
         --------
-        >>> s = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
+        >>> s = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
         >>> s.str.match('dog')
         0    False
         1     True

python/pyspark/pandas/tests/indexes/test_astype.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ def test_astype(self):
         self.assert_eq(psidx.astype(bool), pidx.astype(bool))
         self.assert_eq(psidx.astype("bool"), pidx.astype("bool"))
         self.assert_eq(psidx.astype("?"), pidx.astype("?"))
-        self.assert_eq(psidx.astype(np.unicode_), pidx.astype(np.unicode_))
+        self.assert_eq(psidx.astype(np.str_), pidx.astype(np.str_))
         self.assert_eq(psidx.astype("str"), pidx.astype("str"))
         self.assert_eq(psidx.astype("U"), pidx.astype("U"))

python/pyspark/pandas/tests/series/test_arg_ops.py

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ def test_argmin_argmax(self):
         self.assert_eq(pser.argmax(), psser.argmax())
         self.assert_eq(pser.argmax(skipna=False), psser.argmax(skipna=False))
 
-        pser2 = pd.Series([np.NaN, 1.0, 2.0, np.NaN])
+        pser2 = pd.Series([np.nan, 1.0, 2.0, np.nan])
         psser2 = ps.from_pandas(pser2)
         self.assert_eq(pser2.argmin(), psser2.argmin())
         self.assert_eq(pser2.argmax(), psser2.argmax())

python/pyspark/pandas/tests/series/test_as_of.py

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ def test_asof(self):
         self.assert_eq(psser.asof(20), pser.asof(20))
         self.assert_eq(psser.asof([5, 20]).sort_index(), pser.asof([5, 20]).sort_index())
         self.assert_eq(psser.asof(100), pser.asof(100))
-        self.assert_eq(repr(psser.asof(-100)), repr(pser.asof(-100)))
+        self.assert_eq(str(psser.asof(-100)), str(pser.asof(-100)))
         self.assert_eq(psser.asof([-100, 100]).sort_index(), pser.asof([-100, 100]).sort_index())
 
         # where cannot be an Index, Series or a DataFrame
@@ -55,7 +55,7 @@ def test_asof(self):
 
         self.assert_eq(psser.asof("2014-01-01"), pser.asof("2014-01-01"))
         self.assert_eq(psser.asof("2014-01-02"), pser.asof("2014-01-02"))
-        self.assert_eq(repr(psser.asof("1999-01-02")), repr(pser.asof("1999-01-02")))
+        self.assert_eq(str(psser.asof("1999-01-02")), str(pser.asof("1999-01-02")))
 
         # SPARK-37482: Skip check monotonic increasing for Series.asof with 'compute.eager_check'
         pser = pd.Series([1, 2, np.nan, 4], index=[10, 30, 20, 40])
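Switching these assertions from `repr` to `str` keeps them version-agnostic: NumPy 2.x adds the type name to a scalar's `repr` but leaves `str` unchanged. A minimal sketch of the difference (assumes NumPy 2.x; the scalar is illustrative):

```python
import numpy as np

x = np.float64("nan")   # illustrative NaN-like scalar, e.g. a missing asof() result
print(repr(x))          # NumPy 2.x: "np.float64(nan)"; NumPy 1.x: "nan"
print(str(x))           # "nan" on both, matching str(float("nan"))
```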

python/pyspark/pandas/tests/series/test_as_type.py

Lines changed: 1 addition & 1 deletion
@@ -126,7 +126,7 @@ def _test_numeric_astype(self, pser):
         self.assert_eq(psser.astype(bool), pser.astype(bool))
         self.assert_eq(psser.astype("bool"), pser.astype("bool"))
         self.assert_eq(psser.astype("?"), pser.astype("?"))
-        self.assert_eq(psser.astype(np.unicode_), pser.astype(np.unicode_))
+        self.assert_eq(psser.astype(np.str_), pser.astype(np.str_))
         self.assert_eq(psser.astype("str"), pser.astype("str"))
         self.assert_eq(psser.astype("U"), pser.astype("U"))

python/pyspark/pandas/tests/series/test_string_ops_adv.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def pser(self):
                 "\nleading-whitespace",
                 "trailing-Whitespace \t",
                 None,
-                np.NaN,
+                np.nan,
             ]
         )

python/pyspark/pandas/tests/series/test_string_ops_basic.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def pser(self):
                 "\nleading-whitespace",
                 "trailing-Whitespace \t",
                 None,
-                np.NaN,
+                np.nan,
             ]
         )

python/pyspark/pandas/tests/test_typedef.py

Lines changed: 10 additions & 11 deletions
@@ -93,12 +93,12 @@ def func() -> pd.Series[float]:
         self.assertEqual(inferred.dtype, np.float64)
         self.assertEqual(inferred.spark_type, DoubleType())
 
-        def func() -> "pd.DataFrame[np.float_, str]":
+        def func() -> "pd.DataFrame[np.float64, str]":
             pass
 
         expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
         inferred = infer_return_type(func)
-        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
+        self.assertEqual(inferred.dtypes, [np.float64, np.str_])
         self.assertEqual(inferred.spark_type, expected)
 
         def func() -> "pandas.DataFrame[float]":
@@ -121,10 +121,10 @@ def func() -> pd.DataFrame[np.float64, str]:
 
         expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
         inferred = infer_return_type(func)
-        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
+        self.assertEqual(inferred.dtypes, [np.float64, np.str_])
         self.assertEqual(inferred.spark_type, expected)
 
-        def func() -> pd.DataFrame[np.float_]:
+        def func() -> pd.DataFrame[np.float64]:
             pass
 
         expected = StructType([StructField("c0", DoubleType())])
@@ -167,12 +167,12 @@ def test_if_pandas_implements_class_getitem(self):
         assert not ps._series_has_class_getitem
 
     def test_infer_schema_with_names_pandas_instances(self):
-        def func() -> 'pd.DataFrame["a" : np.float_, "b":str]':  # noqa: F405
+        def func() -> 'pd.DataFrame["a" : np.float64, "b":str]':  # noqa: F405
             pass
 
         expected = StructType([StructField("a", DoubleType()), StructField("b", StringType())])
         inferred = infer_return_type(func)
-        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
+        self.assertEqual(inferred.dtypes, [np.float64, np.str_])
         self.assertEqual(inferred.spark_type, expected)
 
         def func() -> "pd.DataFrame['a': float, 'b': int]":  # noqa: F405
@@ -217,7 +217,7 @@ def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
 
     def test_infer_schema_with_names_pandas_instances_negative(self):
         def try_infer_return_type():
-            def f() -> 'pd.DataFrame["a" : np.float_ : 1, "b":str:2]':  # noqa: F405
+            def f() -> 'pd.DataFrame["a" : np.float64 : 1, "b":str:2]':  # noqa: F405
                 pass
 
             infer_return_type(f)
@@ -283,7 +283,7 @@ def f() -> ps.DataFrame[A]:
         self.assertRaisesRegex(TypeError, "not understood", try_infer_return_type)
 
         def try_infer_return_type():
-            def f() -> 'ps.DataFrame["a" : np.float_ : 1, "b":str:2]':  # noqa: F405
+            def f() -> 'ps.DataFrame["a" : np.float64 : 1, "b":str:2]':  # noqa: F405
                 pass
 
             infer_return_type(f)
@@ -314,7 +314,6 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
             # binary
             np.character: (np.character, BinaryType()),
             np.bytes_: (np.bytes_, BinaryType()),
-            np.string_: (np.bytes_, BinaryType()),
             bytes: (np.bytes_, BinaryType()),
             # integer
             np.int8: (np.int8, ByteType()),
@@ -328,8 +327,8 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
             np.float64: (np.float64, DoubleType()),
             float: (np.float64, DoubleType()),
             # string
-            np.unicode_: (np.unicode_, StringType()),
-            str: (np.unicode_, StringType()),
+            np.str_: (np.str_, StringType()),
+            str: (np.str_, StringType()),
             # bool
             bool: (np.bool_, BooleanType()),
             # datetime

python/pyspark/pandas/typedef/typehints.py

Lines changed: 3 additions & 3 deletions
@@ -176,7 +176,7 @@ def as_spark_type(
             return None
         return types.ArrayType(element_type)
     # BinaryType
-    elif tpe in (bytes, np.character, np.bytes_, np.string_):
+    elif tpe in (bytes, np.character, np.bytes_):
         return types.BinaryType()
     # BooleanType
     elif tpe in (bool, np.bool_, "bool", "?"):
@@ -190,7 +190,7 @@ def as_spark_type(
     elif tpe in (decimal.Decimal,):
         # TODO: considering the precision & scale for decimal type.
         return types.DecimalType(38, 18)
-    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
+    elif tpe in (float, np.double, np.float64, "float", "float64", "double"):
         return types.DoubleType()
     elif tpe in (np.float32, "float32", "f"):
         return types.FloatType()
@@ -201,7 +201,7 @@ def as_spark_type(
     elif tpe in (np.int16, "int16", "short"):
         return types.ShortType()
     # StringType
-    elif tpe in (str, np.unicode_, "str", "U"):
+    elif tpe in (str, np.str_, "str", "U"):
         return types.StringType()
     # TimestampType or TimestampNTZType if timezone is not specified.
     elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M", pd.Timestamp):
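A quick sketch of why the removed names had to go from these type tuples (assumes NumPy 2.x, where accessing them raises):

```python
import numpy as np

# Under NumPy 2.x the removed aliases are gone, so keeping `np.float_` or
# `np.unicode_` in the tuples above would fail as soon as they are evaluated.
try:
    np.float_
except AttributeError as e:
    print(e)   # the message points to np.float64 as the replacement

# The surviving names describe the same concrete dtype.
assert np.dtype(np.double) == np.dtype(np.float64)
```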

python/pyspark/sql/tests/test_arrow_python_udf.py

Lines changed: 9 additions & 5 deletions
@@ -60,7 +60,7 @@ def test_complex_input_types(self):
             .first()
         )
 
-        self.assertEqual(row[0], "[1, 2, 3]")
+        self.assertIn(row[0], ["[1, 2, 3]", "[np.int32(1), np.int32(2), np.int32(3)]"])
         self.assertEqual(row[1], "{'a': 'b'}")
         self.assertEqual(row[2], "Row(col1=1, col2=2)")
 
@@ -119,9 +119,10 @@ def test_register(self):
         str_repr_func = self.spark.udf.register("str_repr", udf(lambda x: str(x), useArrow=True))
 
         # To verify that Arrow optimization is on
-        self.assertEqual(
+        self.assertIn(
             df.selectExpr("str_repr(array) AS str_id").first()[0],
-            "[1, 2, 3]",  # The input is a NumPy array when the Arrow optimization is on
+            ["[1, 2, 3]", "[np.int32(1), np.int32(2), np.int32(3)]"],
+            # The input is a NumPy array when the Arrow optimization is on
         )
 
         # To verify that a UserDefinedFunction is returned
@@ -132,11 +133,14 @@ def test_register(self):
 
     def test_nested_array_input(self):
         df = self.spark.range(1).selectExpr("array(array(1, 2), array(3, 4)) as nested_array")
-        self.assertEqual(
+        self.assertIn(
             df.select(
                 udf(lambda x: str(x), returnType="string", useArrow=True)("nested_array")
             ).first()[0],
-            "[[1, 2], [3, 4]]",
+            [
+                "[[1, 2], [3, 4]]",
+                "[[np.int32(1), np.int32(2)], [np.int32(3), np.int32(4)]]",
+            ],
        )
 
     def test_type_coercion_string_to_numeric(self):
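These tests accept both renderings because the UDF stringifies a Python list of NumPy scalars, and `str()` of a list falls back to each element's `repr`, which changed in NumPy 2.0. A minimal sketch of the two outputs (the contents are illustrative):

```python
import numpy as np

values = [np.int32(1), np.int32(2), np.int32(3)]
print(str(values))
# NumPy 1.x: [1, 2, 3]
# NumPy 2.x: [np.int32(1), np.int32(2), np.int32(3)]
```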

python/pyspark/sql/tests/test_udf.py

Lines changed: 3 additions & 1 deletion
@@ -904,7 +904,9 @@ def test_nested_array(self):
         df = self.spark.range(1).selectExpr("array(array(1, 2), array(3, 4)) as nested_array")
         # Input
         row = df.select(udf(lambda x: str(x))("nested_array")).first()
-        self.assertEqual(row[0], "[[1, 2], [3, 4]]")
+        self.assertIn(
+            row[0], ["[[1, 2], [3, 4]]", "[[np.int32(1), np.int32(2)], [np.int32(3), np.int32(4)]]"]
+        )
         # Output
 
         @udf(returnType=df.dtypes[0][1])
