diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 75f406d908c73..292e8bafc0b86 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -1030,6 +1030,7 @@ I/O
 - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`)
 - Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`)
 - Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`)
+- Bug in :meth:`DataFrame.to_csv` which emitted ``b''`` around bytes (:issue:`9712`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index ea97bab2198eb..8346987511fc6 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1558,6 +1558,19 @@ cdef class Validator:
         else:
             return False
 
+    cdef bint any(self, ndarray values) except -1:
+        """
+        Check whether at least one element of ``values`` is valid.
+        """
+        cdef:
+            Py_ssize_t i
+            Py_ssize_t n = self.n
+
+        for i in range(n):
+            if self.is_valid(values[i]):
+                return True
+        return False
+
     @cython.wraparound(False)
     @cython.boundscheck(False)
     cdef bint _validate(self, ndarray values) except -1:
@@ -1709,13 +1722,26 @@ cdef class BytesValidator(Validator):
     cdef inline bint is_array_typed(self) except -1:
         return issubclass(self.dtype.type, np.bytes_)
 
 
-cdef bint is_bytes_array(ndarray values, bint skipna=False):
+cpdef bint is_bytes_array(ndarray values, bint skipna=False,
+                          bint mixing_allowed=True) except -1:
+    """
+    Check whether all the values are bytes.
+
+    When ``mixing_allowed`` is False and only some of the values are
+    bytes, raise a ValueError instead of returning False.
+    """
     cdef:
         BytesValidator validator = BytesValidator(len(values), values.dtype,
                                                   skipna=skipna)
-    return validator.validate(values)
+
+    is_all_bytes = validator.validate(values)
+    if is_all_bytes or mixing_allowed:
+        return is_all_bytes
+    if validator.any(values):
+        raise ValueError("Cannot mix types")
+    return False
 
 
 cdef class TemporalValidator(Validator):
     cdef:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index b12a556a8291d..feb327b95972a 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -77,7 +77,7 @@
 from pandas.core.ops import get_op_result_name
 from pandas.core.ops.invalid import make_invalid_op
 from pandas.core.sorting import ensure_key_mapped
-from pandas.core.strings import StringMethods
+from pandas.core.strings import StringMethods, str_decode
 
 from pandas.io.formats.printing import (
     PrettyDict,
@@ -954,6 +954,8 @@ def to_native_types(self, slicer=None, **kwargs):
                 Whether or not there are quoted values in `self`
             3) date_format : str
                 The format used to represent date-like values.
+            4) bytes_encoding : str
+                The encoding used to decode bytes values, "utf-8" if omitted.
 
         Returns
         -------
@@ -965,7 +967,9 @@ def to_native_types(self, slicer=None, **kwargs):
             values = values[slicer]
         return values._format_native_types(**kwargs)
 
-    def _format_native_types(self, na_rep="", quoting=None, **kwargs):
+    def _format_native_types(
+        self, na_rep="", quoting=None, bytes_encoding=None, **kwargs
+    ):
         """
         Actually format specific types of the index.
         """
@@ -976,6 +980,10 @@ def _format_native_types(self, na_rep="", quoting=None, **kwargs):
             values = np.array(self, dtype=object, copy=True)
 
+        # Decode before NA values are replaced by ``na_rep`` (a str), which
+        # would otherwise look like a mix of bytes and non-bytes.
+        if lib.is_bytes_array(values, skipna=True, mixing_allowed=False):
+            values = str_decode(values, bytes_encoding or "utf-8")
         values[mask] = na_rep
         return values
 
     def _summary(self, name=None) -> str_t:
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 6207785fb2975..217c864918367 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -83,6 +83,7 @@
 )
 import pandas.core.missing as missing
 from pandas.core.nanops import nanpercentile
+from pandas.core.strings import str_decode
 
 if TYPE_CHECKING:
     from pandas import Index
@@ -653,13 +654,26 @@ def should_store(self, value: ArrayLike) -> bool:
         """
         return is_dtype_equal(value.dtype, self.dtype)
 
-    def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
+    def to_native_types(
+        self, na_rep="nan", quoting=None, bytes_encoding=None, **kwargs
+    ):
         """ convert to our native types format """
         values = self.values
 
         mask = isna(values)
         itemsize = writers.word_len(na_rep)
 
+        if self.is_object:
+            encoding = bytes_encoding or "utf-8"
+            for i in range(values.shape[0]):
+                row = values[i]
+                if lib.is_bytes_array(row, skipna=True, mixing_allowed=False):
+                    if values is self.values:
+                        # Copy first so that the block's own data (and thus
+                        # the caller's frame) is not modified in place.
+                        values = values.copy()
+                    values[i] = str_decode(row, encoding)
+
         if not self.is_object and not quoting and itemsize:
             values = values.astype(str)
             if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 5bd51dc8351f6..67c5d57d19080 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -11,7 +11,7 @@
 
 import numpy as np
 
-from pandas._libs import writers as libwriters
+from pandas._libs import lib, writers as libwriters
 from pandas._typing import FilePathOrBuffer
 
 from pandas.core.dtypes.generic import (
@@ -108,6 +108,7 @@ def __init__(
             if isinstance(cols, ABCIndexClass):
                 cols = cols.to_native_types(
                     na_rep=na_rep,
+                    bytes_encoding=self.encoding,
                     float_format=float_format,
                     date_format=date_format,
                     quoting=self.quoting,
@@ -122,6 +123,7 @@ def __init__(
         if isinstance(cols, ABCIndexClass):
             cols = cols.to_native_types(
                 na_rep=na_rep,
+                bytes_encoding=self.encoding,
                 float_format=float_format,
                 date_format=date_format,
                 quoting=self.quoting,
@@ -278,6 +280,8 @@ def _save_header(self):
             else:
                 encoded_labels = []
 
+        self._bytes_to_str(encoded_labels)
+
         if not has_mi_columns or has_aliases:
             encoded_labels += list(write_cols)
             writer.writerow(encoded_labels)
@@ -300,6 +304,7 @@ def _save_header(self):
                         col_line.extend([""] * (len(index_label) - 1))
 
                 col_line.extend(columns._get_level_values(i))
+                self._bytes_to_str(col_line)
 
                 writer.writerow(col_line)
 
@@ -340,6 +345,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
             b = blocks[i]
             d = b.to_native_types(
                 na_rep=self.na_rep,
+                bytes_encoding=self.encoding,
                 float_format=self.float_format,
                 decimal=self.decimal,
                 date_format=self.date_format,
@@ -353,6 +359,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
         ix = data_index.to_native_types(
             slicer=slicer,
             na_rep=self.na_rep,
+            bytes_encoding=self.encoding,
             float_format=self.float_format,
             decimal=self.decimal,
             date_format=self.date_format,
@@ -360,3 +367,27 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
         )
 
         libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
+
+    def _bytes_to_str(self, values):
+        """
+        Decode the bytes in ``values`` to str, in place.
+
+        Nothing is changed unless every non-missing value is bytes.
+
+        Parameters
+        ----------
+        values : list
+            Labels about to be written to the header.
+
+        Raises
+        ------
+        ValueError
+            If ``values`` mixes bytes and non-bytes values.
+        """
+        np_values = np.array(values, dtype=object)
+        if lib.is_bytes_array(np_values, skipna=True, mixing_allowed=False):
+            encoding = self.encoding or "utf-8"
+            for i, value in enumerate(values):
+                # Missing values pass the check above but cannot be decoded.
+                if isinstance(value, bytes):
+                    values[i] = value.decode(encoding)
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index 2b7b3af8f4705..d9c35dba15586 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -740,6 +740,90 @@ def test_to_csv_withcommas(self):
             df2 = self.read_csv(path)
             tm.assert_frame_equal(df2, df)
 
+    def test_to_csv_bytes(self):
+        # GH 9712
+        times = date_range("2013-10-27 23:00", "2013-10-28 00:00", freq="H")
+        df = DataFrame({b"foo": [b"bar", b"baz"], b"times": times}, index=[b"A", b"B"])
+        df.loc[b"C"] = np.nan
+        df.index.name = b"idx"
+        df_original = df.copy()
+
+        df_expected = DataFrame(
+            {"foo": ["bar", "baz"], "times": times}, index=["A", "B"]
+        )
+        df_expected.loc["C"] = np.nan
+        df_expected.index.name = "idx"
+
+        with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
+            df.to_csv(path, header=True)
+            df_output = self.read_csv(path)
+            df_output.times = to_datetime(df_output.times)
+            tm.assert_frame_equal(df_output, df_expected)
+
+        # to_csv must not decode the bytes held by the frame itself
+        tm.assert_frame_equal(df, df_original)
+
+        non_unicode_byte = b"\xbc\xa6"
+        non_unicode_decoded = non_unicode_byte.decode("gb18030")
+        df = DataFrame({non_unicode_byte: [non_unicode_byte, b"foo"]})
+        df.index.name = "idx"
+
+        df_expected = DataFrame({non_unicode_decoded: [non_unicode_decoded, "foo"]})
+        df_expected.index.name = "idx"
+
+        with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
+            df.to_csv(path, encoding="gb18030", header=True)
+            df_output = self.read_csv(path, encoding="gb18030")
+            tm.assert_frame_equal(df_output, df_expected)
+
+        # decoding error, when transcoding fails
+        with pytest.raises(UnicodeDecodeError):
+            df.to_csv(encoding="utf-8")
+
+        # mixing of bytes and non-bytes
+        df = DataFrame({"foo": [b"bar", "baz"]})
+        with pytest.raises(ValueError):
+            df.to_csv()
+        df = DataFrame({b"foo": ["a", "b"], "bar": ["c", "d"]})
+        with pytest.raises(ValueError):
+            df.to_csv()
+        df = DataFrame({"foo": ["a", "b"], "bar": ["c", "d"]}, index=["A", b"B"])
+        with pytest.raises(ValueError):
+            df.to_csv()
+
+        # multi-indexes
+        iterables = [[b"A", b"B"], ["C", "D"]]
+        index = pd.MultiIndex.from_product(iterables, names=[b"f", b"s"])
+        data = np.array([[0, 0], [0, 0], [0, 0], [0, 0]])
+        df = pd.DataFrame(data, index=index)
+
+        with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
+            df.to_csv(path)
+            with open(path) as csvfile:
+                output = csvfile.readlines()
+
+        expected = [
+            "f,s,0,1\n",
+            "A,C,0,0\n",
+            "A,D,0,0\n",
+            "B,C,0,0\n",
+            "B,D,0,0\n",
+        ]
+        assert output == expected
+
+        # mixing of bytes and non-bytes in multi-indexes
+        iterables = [[b"A", "B"], ["C", "D"]]
+        index = pd.MultiIndex.from_product(iterables)
+        df = pd.DataFrame(data, index=index)
+        with pytest.raises(ValueError):
+            df.to_csv()
+
+        iterables = [["A", "B"], ["C", "D"]]
+        index = pd.MultiIndex.from_product(iterables, names=[b"f", "s"])
+        df = pd.DataFrame(data, index=index)
+        with pytest.raises(ValueError):
+            df.to_csv()
+
     def test_to_csv_mixed(self):
         def create_cols(name):
             return [f"{name}{i:03d}" for i in range(5)]