Skip to content

BUG: Avoids b' prefix for bytes in to_csv() (#9712) #35004

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1030,6 +1030,7 @@ I/O
- Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`)
- Bug in :meth:`ujson.encode` was raising an ``OverflowError`` with numbers larger than sys.maxsize (:issue:`34395`)
- Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`)
- Bug in :meth:`DataFrame.to_csv` which emitted a ``b''`` prefix around bytes values (:issue:`9712`)

Plotting
^^^^^^^^
Expand Down
30 changes: 26 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,17 @@ cdef class Validator:
else:
return False

cdef bint any(self, ndarray values) except -1:
    """Return True if at least one element of ``values`` is valid.

    Counterpart to the all-elements ``_validate`` check: scans ``values``
    and short-circuits on the first element accepted by ``is_valid``.
    An empty input (``self.n == 0``) yields False.
    """
    # Fast path: nothing to scan.
    if not self.n:
        return False
    cdef:
        Py_ssize_t i
        Py_ssize_t n = self.n
    for i in range(n):
        # is_valid is supplied by the concrete Validator subclass
        # (e.g. BytesValidator presumably accepts bytes instances —
        # its is_value_typed body is outside this view).
        if self.is_valid(values[i]):
            return True
    return False

@cython.wraparound(False)
@cython.boundscheck(False)
cdef bint _validate(self, ndarray values) except -1:
Expand Down Expand Up @@ -1709,13 +1720,24 @@ cdef class BytesValidator(Validator):
cdef inline bint is_array_typed(self) except -1:
return issubclass(self.dtype.type, np.bytes_)


cdef bint is_bytes_array(ndarray values, bint skipna=False):
cpdef bint is_bytes_array(ndarray values, bint skipna=False,
                          bint mixing_allowed=True) except -1:
    """Check whether every value in ``values`` is bytes.

    When ``mixing_allowed`` is False and the array holds a mix of bytes
    and non-bytes values, a ValueError is raised instead of returning
    False.
    """
    cdef:
        BytesValidator validator = BytesValidator(len(values), values.dtype,
                                                  skipna=skipna)

    # An all-bytes array is accepted under either mixing policy.
    if validator.validate(values):
        return True

    # Not all bytes: when the caller forbids mixing, the presence of even
    # a single bytes value means the array mixes types.
    if not mixing_allowed and validator.any(values):
        raise ValueError("Cannot mix types")
    return False

cdef class TemporalValidator(Validator):
cdef:
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
from pandas.core.ops import get_op_result_name
from pandas.core.ops.invalid import make_invalid_op
from pandas.core.sorting import ensure_key_mapped
from pandas.core.strings import StringMethods
from pandas.core.strings import StringMethods, str_decode

from pandas.io.formats.printing import (
PrettyDict,
Expand Down Expand Up @@ -954,6 +954,8 @@ def to_native_types(self, slicer=None, **kwargs):
Whether or not there are quoted values in `self`
3) date_format : str
The format used to represent date-like values.
4) bytes_encoding : str
The encoding scheme to use to decode the bytes.

Returns
-------
Expand All @@ -965,7 +967,9 @@ def to_native_types(self, slicer=None, **kwargs):
values = values[slicer]
return values._format_native_types(**kwargs)

def _format_native_types(self, na_rep="", quoting=None, **kwargs):
def _format_native_types(
self, na_rep="", quoting=None, bytes_encoding=None, **kwargs
):
"""
Actually format specific types of the index.
"""
Expand All @@ -976,6 +980,8 @@ def _format_native_types(self, na_rep="", quoting=None, **kwargs):
values = np.array(self, dtype=object, copy=True)

values[mask] = na_rep
if lib.is_bytes_array(values, skipna=True, mixing_allowed=False):
values = str_decode(values, bytes_encoding)
return values

def _summary(self, name=None) -> str_t:
Expand Down
10 changes: 9 additions & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
)
import pandas.core.missing as missing
from pandas.core.nanops import nanpercentile
from pandas.core.strings import str_decode

if TYPE_CHECKING:
from pandas import Index
Expand Down Expand Up @@ -653,13 +654,20 @@ def should_store(self, value: ArrayLike) -> bool:
"""
return is_dtype_equal(value.dtype, self.dtype)

def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
def to_native_types(
self, na_rep="nan", bytes_encoding=None, quoting=None, **kwargs
):
""" convert to our native types format """
values = self.values

mask = isna(values)
itemsize = writers.word_len(na_rep)

length = values.shape[0]
for i in range(length):
if lib.is_bytes_array(values[i], skipna=True, mixing_allowed=False):
values[i] = str_decode(values[i], bytes_encoding)

if not self.is_object and not quoting and itemsize:
values = values.astype(str)
if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
Expand Down
17 changes: 16 additions & 1 deletion pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import numpy as np

from pandas._libs import writers as libwriters
from pandas._libs import lib, writers as libwriters
from pandas._typing import FilePathOrBuffer

from pandas.core.dtypes.generic import (
Expand Down Expand Up @@ -108,6 +108,7 @@ def __init__(
if isinstance(cols, ABCIndexClass):
cols = cols.to_native_types(
na_rep=na_rep,
bytes_encoding=self.encoding,
float_format=float_format,
date_format=date_format,
quoting=self.quoting,
Expand All @@ -122,6 +123,7 @@ def __init__(
if isinstance(cols, ABCIndexClass):
cols = cols.to_native_types(
na_rep=na_rep,
bytes_encoding=self.encoding,
float_format=float_format,
date_format=date_format,
quoting=self.quoting,
Expand Down Expand Up @@ -278,6 +280,8 @@ def _save_header(self):
else:
encoded_labels = []

self._bytes_to_str(encoded_labels)

if not has_mi_columns or has_aliases:
encoded_labels += list(write_cols)
writer.writerow(encoded_labels)
Expand All @@ -300,6 +304,7 @@ def _save_header(self):
col_line.extend([""] * (len(index_label) - 1))

col_line.extend(columns._get_level_values(i))
self._bytes_to_str(col_line)

writer.writerow(col_line)

Expand Down Expand Up @@ -340,6 +345,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
b = blocks[i]
d = b.to_native_types(
na_rep=self.na_rep,
bytes_encoding=self.encoding,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
Expand All @@ -353,10 +359,19 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
ix = data_index.to_native_types(
slicer=slicer,
na_rep=self.na_rep,
bytes_encoding=self.encoding,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
quoting=self.quoting,
)

libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)

def _bytes_to_str(self, values):
    """Decode an all-bytes list of labels to str, in place.

    If every non-missing element of ``values`` is ``bytes``, each bytes
    element is replaced by its str form decoded with ``self.encoding``.
    A mix of bytes and non-bytes values raises ``ValueError`` (via
    ``lib.is_bytes_array`` with ``mixing_allowed=False``).
    """
    np_values = np.array(values, dtype=object)
    if lib.is_bytes_array(np_values, skipna=True, mixing_allowed=False):
        # skipna=True lets missing values (e.g. NaN) through the all-bytes
        # check, so decode only actual bytes entries; calling .decode()
        # unconditionally would crash on the missing values.
        for i, value in enumerate(values):
            if isinstance(value, bytes):
                values[i] = value.decode(self.encoding)
83 changes: 83 additions & 0 deletions pandas/tests/frame/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,89 @@ def test_to_csv_withcommas(self):
df2 = self.read_csv(path)
tm.assert_frame_equal(df2, df)

def test_to_csv_bytes(self):
    # GH 9712: bytes values must be decoded on write instead of being
    # rendered with a b'' prefix.
    times = date_range("2013-10-27 23:00", "2013-10-28 00:00", freq="H")
    df = DataFrame({b"foo": [b"bar", b"baz"], b"times": times}, index=[b"A", b"B"])
    df.loc[b"C"] = np.nan
    df.index.name = b"idx"

    df_expected = DataFrame(
        {"foo": ["bar", "baz"], "times": times}, index=["A", "B"]
    )
    df_expected.loc["C"] = np.nan
    df_expected.index.name = "idx"

    with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
        df.to_csv(path, header=True)
        df_output = self.read_csv(path)
        df_output.times = to_datetime(df_output.times)
        tm.assert_frame_equal(df_output, df_expected)

    # round-trip bytes that are not valid utf-8 via an explicit encoding
    non_unicode_byte = b"\xbc\xa6"
    non_unicode_decoded = non_unicode_byte.decode("gb18030")
    df = DataFrame({non_unicode_byte: [non_unicode_byte, b"foo"]})
    df.index.name = "idx"

    df_expected = DataFrame({non_unicode_decoded: [non_unicode_decoded, "foo"]})
    df_expected.index.name = "idx"

    with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
        df.to_csv(path, encoding="gb18030", header=True)
        df_output = self.read_csv(path, encoding="gb18030")
        tm.assert_frame_equal(df_output, df_expected)

    # decoding error, when transcoding fails
    with pytest.raises(UnicodeDecodeError):
        df.to_csv(encoding="utf-8")

    # mixing of bytes and non-bytes is rejected
    df = DataFrame({"foo": [b"bar", "baz"]})
    with pytest.raises(ValueError):
        df.to_csv()
    df = DataFrame({b"foo": ["a", "b"], "bar": ["c", "d"]})
    with pytest.raises(ValueError):
        df.to_csv()
    df = DataFrame({"foo": ["a", "b"], "bar": ["c", "d"]}, index=["A", b"B"])
    with pytest.raises(ValueError):
        df.to_csv()

    # multi-indexes with bytes levels and names
    iterables = [[b"A", b"B"], ["C", "D"]]
    index = pd.MultiIndex.from_product(iterables, names=[b"f", b"s"])
    data = np.array([[0, 0], [0, 0], [0, 0], [0, 0]])
    df = pd.DataFrame(data, index=index)

    with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
        df.to_csv(path)
        with open(path) as csvfile:
            output = csvfile.readlines()

    expected = [
        "f,s,0,1\n",
        "A,C,0,0\n",
        "A,D,0,0\n",
        "B,C,0,0\n",
        "B,D,0,0\n",
    ]
    assert output == expected

    # mixing of bytes and non-bytes in multi-indexes is rejected
    iterables = [[b"A", "B"], ["C", "D"]]
    index = pd.MultiIndex.from_product(iterables)
    df = pd.DataFrame(data, index=index)
    with pytest.raises(ValueError):
        df.to_csv()

    iterables = [["A", "B"], ["C", "D"]]
    index = pd.MultiIndex.from_product(iterables, names=[b"f", "s"])
    df = pd.DataFrame(data, index=index)
    with pytest.raises(ValueError):
        df.to_csv()

def test_to_csv_mixed(self):
def create_cols(name):
return [f"{name}{i:03d}" for i in range(5)]
Expand Down