Merge remote-tracking branch 'upstream/main' into doc-date-parser

Shashwat · Shashwat · commit 96594f1a6d0e · 2022-12-18T19:52:12.000+05:30
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -38,6 +38,8 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_csv`
 * :func:`read_excel`
 * :func:`read_sql`
+* :func:`read_sql_query`
+* :func:`read_sql_table`
 
 Additionally a new global configuration, ``mode.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
 to select the nullable dtypes implementation.
@@ -394,7 +396,7 @@ If installed, we now require:
 +-----------------+-----------------+----------+---------+
 | Package         | Minimum Version | Required | Changed |
 +=================+=================+==========+=========+
-| mypy (dev)      | 0.990           |          |    X    |
+| mypy (dev)      | 0.991           |          |    X    |
 +-----------------+-----------------+----------+---------+
 | python-dateutil | 2.8.2           |    X     |    X    |
 +-----------------+-----------------+----------+---------+
@@ -880,7 +882,7 @@ I/O
 - Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`)
 - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
 - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
--
+- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
 
 Period
 ^^^^^^
diff --git a/environment.yml b/environment.yml
@@ -80,7 +80,7 @@ dependencies:
   - flake8=6.0.0
   - flake8-bugbear=22.7.1 # used by flake8, find likely bugs
   - isort>=5.2.1  # check that imports are in the right order
-  - mypy=0.990
+  - mypy=0.991
   - pre-commit>=2.15.0
   - pycodestyle  # used by flake8
   - pyupgrade
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -332,9 +332,18 @@ static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
     return PyBytes_AS_STRING(obj);
 }
 
-static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
+static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
                              size_t *_outLen) {
-    return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
+    char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj,
+                                                    (Py_ssize_t *)_outLen);
+    if (encoded == NULL) {
+        /* Something went wrong.
+           Set errorMsg (to tell the encoder to stop)
+           and let the Python exception propagate. */
+        JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
+        enc->errorMsg = "Encoding failed.";
+    }
+    return encoded;
 }
 
 /* JSON callback. returns a char* and mutates the pointer to *len */
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -224,6 +224,7 @@ def read_sql_table(
     parse_dates: list[str] | dict[str, str] | None = ...,
     columns: list[str] | None = ...,
     chunksize: None = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> DataFrame:
     ...
 
@@ -238,6 +239,7 @@ def read_sql_table(
     parse_dates: list[str] | dict[str, str] | None = ...,
     columns: list[str] | None = ...,
     chunksize: int = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> Iterator[DataFrame]:
     ...
 
@@ -251,6 +253,7 @@ def read_sql_table(
     parse_dates: list[str] | dict[str, str] | None = None,
     columns: list[str] | None = None,
     chunksize: int | None = None,
+    use_nullable_dtypes: bool = False,
 ) -> DataFrame | Iterator[DataFrame]:
     """
     Read SQL database table into a DataFrame.
@@ -287,6 +290,12 @@ def read_sql_table(
     chunksize : int, default None
         If specified, returns an iterator where `chunksize` is the number of
         rows to include in each chunk.
+    use_nullable_dtypes : bool, default False
+        Whether to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        .. versionadded:: 2.0
 
     Returns
     -------
@@ -318,6 +327,7 @@ def read_sql_table(
             parse_dates=parse_dates,
             columns=columns,
             chunksize=chunksize,
+            use_nullable_dtypes=use_nullable_dtypes,
         )
 
     if table is not None:
@@ -336,6 +346,7 @@ def read_sql_query(
     parse_dates: list[str] | dict[str, str] | None = ...,
     chunksize: None = ...,
     dtype: DtypeArg | None = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> DataFrame:
     ...
 
@@ -350,6 +361,7 @@ def read_sql_query(
     parse_dates: list[str] | dict[str, str] | None = ...,
     chunksize: int = ...,
     dtype: DtypeArg | None = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> Iterator[DataFrame]:
     ...
 
@@ -363,6 +375,7 @@ def read_sql_query(
     parse_dates: list[str] | dict[str, str] | None = None,
     chunksize: int | None = None,
     dtype: DtypeArg | None = None,
+    use_nullable_dtypes: bool = False,
 ) -> DataFrame | Iterator[DataFrame]:
     """
     Read SQL query into a DataFrame.
@@ -406,6 +419,12 @@ def read_sql_query(
         {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’}.
 
         .. versionadded:: 1.3.0
+    use_nullable_dtypes : bool, default False
+        Whether to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        .. versionadded:: 2.0
 
     Returns
     -------
@@ -430,6 +449,7 @@ def read_sql_query(
             parse_dates=parse_dates,
             chunksize=chunksize,
             dtype=dtype,
+            use_nullable_dtypes=use_nullable_dtypes,
         )
 
 
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
@@ -291,6 +291,15 @@ def test_encode_unicode_4bytes_utf8highest(self):
         assert enc == json.dumps(four_bytes_input)
         assert dec == json.loads(enc)
 
+    def test_encode_unicode_error(self):
+        string = "'\udac0'"
+        msg = (
+            r"'utf-8' codec can't encode character '\\udac0' "
+            r"in position 1: surrogates not allowed"
+        )
+        with pytest.raises(UnicodeEncodeError, match=msg):
+            ujson.dumps([string])
+
     def test_encode_array_in_array(self):
         arr_in_arr_input = [[[[]]]]
         output = ujson.encode(arr_in_arr_input)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -2276,21 +2276,22 @@ def test_get_engine_auto_error_message(self):
         pass
         # TODO(GH#36893) fill this in when we add more engines
 
-    def test_read_sql_nullable_dtypes(self, string_storage):
+    @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"])
+    def test_read_sql_nullable_dtypes(self, string_storage, func):
         # GH#50048
         table = "test"
         df = self.nullable_data()
         df.to_sql(table, self.conn, index=False, if_exists="replace")
 
         with pd.option_context("mode.string_storage", string_storage):
-            result = pd.read_sql(
+            result = getattr(pd, func)(
                 f"Select * from {table}", self.conn, use_nullable_dtypes=True
             )
         expected = self.nullable_expected(string_storage)
         tm.assert_frame_equal(result, expected)
 
         with pd.option_context("mode.string_storage", string_storage):
-            iterator = pd.read_sql(
+            iterator = getattr(pd, func)(
                 f"Select * from {table}",
                 self.conn,
                 use_nullable_dtypes=True,
@@ -2300,20 +2301,21 @@ def test_read_sql_nullable_dtypes(self, string_storage):
             for result in iterator:
                 tm.assert_frame_equal(result, expected)
 
-    def test_read_sql_nullable_dtypes_table(self, string_storage):
+    @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"])
+    def test_read_sql_nullable_dtypes_table(self, string_storage, func):
         # GH#50048
         table = "test"
         df = self.nullable_data()
         df.to_sql(table, self.conn, index=False, if_exists="replace")
 
         with pd.option_context("mode.string_storage", string_storage):
-            result = pd.read_sql(table, self.conn, use_nullable_dtypes=True)
+            result = getattr(pd, func)(table, self.conn, use_nullable_dtypes=True)
         expected = self.nullable_expected(string_storage)
         tm.assert_frame_equal(result, expected)
 
         with pd.option_context("mode.string_storage", string_storage):
-            iterator = pd.read_sql(
-                f"Select * from {table}",
+            iterator = getattr(pd, func)(
+                table,
                 self.conn,
                 use_nullable_dtypes=True,
                 chunksize=3,
@@ -2463,7 +2465,8 @@ class Test(BaseModel):
     def nullable_expected(self, storage) -> DataFrame:
         return super().nullable_expected(storage).astype({"e": "Int64", "f": "Int64"})
 
-    def test_read_sql_nullable_dtypes_table(self, string_storage):
+    @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"])
+    def test_read_sql_nullable_dtypes_table(self, string_storage, func):
         # GH#50048 Not supported for sqlite
         pass
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -57,7 +57,7 @@ cpplint
 flake8==6.0.0
 flake8-bugbear==22.7.1
 isort>=5.2.1
-mypy==0.990
+mypy==0.991
 pre-commit>=2.15.0
 pycodestyle
 pyupgrade