Skip to content

BUG: Index with null value not serialized correctly to json #50400

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1012,6 +1012,8 @@ I/O
- Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
- Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
- Bug in :meth:`DataFrame.to_json` where it would incorrectly use the string representations of NA-values instead of null when serializing an index (:issue:`31801`)
- Bug in :meth:`DataFrame.to_json` where it would error when serializing ``Decimal("NaN")`` (:issue:`50400`)

Period
^^^^^^
Expand Down
78 changes: 63 additions & 15 deletions pandas/_libs/src/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,27 @@ static int is_simple_frame(PyObject *obj) {
Py_DECREF(mgr);
return ret;
}
/* TODO: Consider unifying with checknull and co.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a quick note in the docstring that this returns -1 on error?

in missing.pyx */
static int is_null_obj(PyObject* obj) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit, but I think this should be called `is_json_null`, since its semantics may differ from what we have elsewhere in the codebase.

int is_null = 0;
if (PyFloat_Check(obj)) {
double fval = PyFloat_AS_DOUBLE(obj);
is_null = npy_isnan(fval);
} else if (obj == Py_None || object_is_na_type(obj)) {
is_null = 1;
} else if (object_is_decimal_type(obj)) {
PyObject *is_null_obj = PyObject_CallMethod(obj,
"is_nan",
NULL);
is_null = (is_null_obj == Py_True);
if (!is_null_obj) {
return -1;
}
Py_DECREF(is_null_obj);
}
return is_null;
}

static npy_int64 get_long_attr(PyObject *o, const char *attr) {
// NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT
Expand Down Expand Up @@ -1283,6 +1304,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
type_num = PyArray_TYPE(labels);

for (i = 0; i < num; i++) {
int is_null = 0; // Whether current val is a null
item = PyArray_GETITEM(labels, dataptr);
if (!item) {
NpyArr_freeLabels(ret, num);
Expand Down Expand Up @@ -1320,9 +1342,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,

if (is_datetimelike) {
if (nanosecVal == get_nat()) {
len = 4;
cLabel = PyObject_Malloc(len + 1);
strncpy(cLabel, "null", len + 1);
is_null = 1;
} else {
if (enc->datetimeIso) {
if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
Expand All @@ -1348,25 +1368,41 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
len = strlen(cLabel);
}
}
} else { // Fallback to string representation
// Replace item with the string to keep it alive.
Py_SETREF(item, PyObject_Str(item));
if (item == NULL) {
NpyArr_freeLabels(ret, num);
ret = 0;
break;
} else {
// NA values need special handling
is_null = is_null_obj(item);
if (is_null == -1) {
// Something errored
// Return to let the error surface
return 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised we return 0 here, but I see that you are just matching the pattern of the rest of the function. It should really return NULL in case of an error - it looks like the error is not handled properly. But of course that is separate from this PR.

}
if (!is_null) {
// Otherwise, fallback to string representation
// Replace item with the string to keep it alive.
Py_SETREF(item, PyObject_Str(item));
if (item == NULL) {
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}

cLabel = (char *)PyUnicode_AsUTF8(item);
len = strlen(cLabel);
}
}

cLabel = (char *)PyUnicode_AsUTF8(item);
len = strlen(cLabel);
if (is_null) {
len = 4;
cLabel = PyObject_Malloc(len + 1);
strncpy(cLabel, "null", len + 1);
}

// Add 1 to include NULL terminator
ret[i] = PyObject_Malloc(len + 1);
memcpy(ret[i], cLabel, len + 1);
Py_DECREF(item);

if (is_datetimelike) {
if (is_datetimelike || is_null) {
PyObject_Free(cLabel);
}

Expand Down Expand Up @@ -1512,8 +1548,20 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
tc->type = JT_UTF8;
return;
} else if (object_is_decimal_type(obj)) {
GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj);
tc->type = JT_DOUBLE;
/* Check for null, since null can't go thru double path */
PyObject *is_null_obj = PyObject_CallMethod(obj,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this not be replaced with the function you are introducing? It seems like that should work, and it would keep the logic consistent.

"is_nan",
NULL);
if (!is_null_obj) {
goto INVALID;
}
if (is_null_obj == Py_False) {
GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj);
tc->type = JT_DOUBLE;
} else {
tc->type = JT_NULL;
}
Py_DECREF(is_null_obj);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a case where `PyObject_CallMethod` returns NULL, this would try to `Py_DECREF` NULL.

return;
} else if (PyDateTime_Check(obj) || PyDate_Check(obj)) {
if (object_is_nat_type(obj)) {
Expand Down
12 changes: 6 additions & 6 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import datetime
from datetime import timedelta
from decimal import Decimal
from io import StringIO
import json
import os
Expand Down Expand Up @@ -1766,15 +1765,16 @@ def test_to_s3(self, s3_resource, s3so):
timeout -= 0.1
assert timeout > 0, "Timed out waiting for file to appear on moto"

def test_json_pandas_nulls(self, nulls_fixture, request):
def test_json_pandas_nulls(self, nulls_fixture):
# GH 31615
if isinstance(nulls_fixture, Decimal):
mark = pytest.mark.xfail(reason="not implemented")
request.node.add_marker(mark)

result = DataFrame([[nulls_fixture]]).to_json()
assert result == '{"0":{"0":null}}'

def test_json_pandas_index_nulls(self, nulls_fixture):
# GH 31801
result = Series([1], index=[nulls_fixture]).to_json()
assert result == '{"null":1}'

def test_readjson_bool_series(self):
# GH31464
result = read_json("[true, true, false]", typ="series")
Expand Down