Code review update: set mask to None if the values of the series to serialize implement `__arrow_array__`

nicolasazrak · nicolasazrak · commit 944a515e1909 · 2021-12-04T14:56:27.000-03:00
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
@@ -160,7 +160,10 @@ def _create_batch(self, series):
         series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series)
 
         def create_array(s, t):
-            mask = s.isnull()
+            if hasattr(s.values, '__arrow_array__'):
+                mask = None
+            else:
+                mask = s.isnull()
             # Ensure timestamp series are in expected form for Spark internal representation
             if t is not None and pa.types.is_timestamp(t) and t.tz is not None:
                 s = _check_series_convert_timestamps_internal(s, self._timezone)
@@ -169,8 +172,6 @@ def create_array(s, t):
             elif is_categorical_dtype(s.dtype):
                 # Note: This can be removed once minimum pyarrow version is >= 0.16.1
                 s = s.astype(s.dtypes.categories.dtype)
-            elif t is not None and pa.types.is_string(t):
-                s = s.astype(str)
             try:
                 array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)
             except ValueError as e:
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
@@ -425,6 +425,13 @@ def test_createDataFrame_with_string_dtype(self):
             # Changing that to use a StringArray would be backwards incompatible.
             assert_frame_equal(pandas_df, df.toPandas(), check_dtype=False)
 
+    def test_createDataFrame_with_int64(self):
+        # SPARK-34521: spark.createDataFrame should accept a pandas nullable Int64 column with a missing value
+        with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": True}):
+            pandas_df = pd.DataFrame({"col": [1, 2, 3, None]}, dtype="Int64")
+            df = self.spark.createDataFrame(pandas_df)
+            assert_frame_equal(pandas_df, df.toPandas(), check_dtype=False)
+
     def test_toPandas_with_map_type(self):
         pdf = pd.DataFrame({"id": [0, 1, 2, 3],
                             "m": [{}, {"a": 1}, {"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}]})