Fix read_sas for RLE-compressed SAS7BDAT files that contain 0x40 control
bytes, and for scrambled column names (GH 31243).

* sas.pyx: the copy length for the 0x40 control byte becomes
  (next byte & 0xFF) + 18 + end_of_first_byte * 256.
* sas7bdat.py: column text blocks are stored as raw bytes and are passed
  through _convert_header_text only after they have been sliced; header
  fields are read via the new _read_and_convert_header_text helper.
* _get_properties assigns the release string to `sas_release`, the attribute
  name the pre-refactor code used (not `sas_release_offset`).
* Adds a regression test backed by the 0x40controlbyte fixture pair.

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 76f6e864a174f..3d8a174740498 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -874,6 +874,9 @@ I/O
 - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`)
 - Bug in :class:`StataWriterUTF8` where some valid characters were removed from variable names (:issue:`47276`)
 - Bug in :meth:`DataFrame.to_excel` when writing an empty dataframe with :class:`MultiIndex` (:issue:`19543`)
+- Bug in :func:`read_sas` with RLE-compressed SAS7BDAT files that contain 0x40 control bytes (:issue:`31243`)
+- Bug in :func:`read_sas` that scrambled column names (:issue:`31243`)
+-
 
 Period
 ^^^^^^
diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx
index 2df3e1f7243da..9fcef64e07133 100644
--- a/pandas/io/sas/sas.pyx
+++ b/pandas/io/sas/sas.pyx
@@ -38,8 +38,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff)
             ipos += 1
         elif control_byte == 0x40:
             # not documented
-            nbytes = end_of_first_byte * 16
-            nbytes += (inbuff[ipos])
+            nbytes = (inbuff[ipos] & 0xFF) + 18 + end_of_first_byte * 256
             ipos += 1
             for _ in range(nbytes):
                 result[rpos] = inbuff[ipos]
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index a992c1af5ddaf..bb70532786fb1 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -182,9 +182,9 @@ def __init__(
 
         self.default_encoding = "latin-1"
         self.compression = b""
-        self.column_names_strings: list[str] = []
-        self.column_names: list[str] = []
-        self.column_formats: list[str] = []
+        self.column_names_raw: list[bytes] = []
+        self.column_names: list[str | bytes] = []
+        self.column_formats: list[str | bytes] = []
         self.columns: list[_Column] = []
 
         self._current_page_data_subheader_pointers: list[_SubheaderPointer] = []
@@ -278,17 +278,13 @@ def _get_properties(self) -> None:
         else:
             self.platform = "unknown"
 
-        buf = self._read_bytes(const.dataset_offset, const.dataset_length)
-        self.name = buf.rstrip(b"\x00 ")
-        if self.convert_header_text:
-            self.name = self.name.decode(self.encoding or self.default_encoding)
+        self.name = self._read_and_convert_header_text(
+            const.dataset_offset, const.dataset_length
+        )
 
-        buf = self._read_bytes(const.file_type_offset, const.file_type_length)
-        self.file_type = buf.rstrip(b"\x00 ")
-        if self.convert_header_text:
-            self.file_type = self.file_type.decode(
-                self.encoding or self.default_encoding
-            )
+        self.file_type = self._read_and_convert_header_text(
+            const.file_type_offset, const.file_type_length
+        )
 
         # Timestamp is epoch 01/01/1960
         epoch = datetime(1960, 1, 1)
@@ -320,46 +316,25 @@ def _get_properties(self) -> None:
             const.page_count_offset + align1, const.page_count_length
         )
 
-        buf = self._read_bytes(
+        self.sas_release = self._read_and_convert_header_text(
             const.sas_release_offset + total_align, const.sas_release_length
         )
-        self.sas_release = buf.rstrip(b"\x00 ")
-        if self.convert_header_text:
-            self.sas_release = self.sas_release.decode(
-                self.encoding or self.default_encoding
-            )
 
-        buf = self._read_bytes(
+        self.server_type = self._read_and_convert_header_text(
             const.sas_server_type_offset + total_align, const.sas_server_type_length
         )
-        self.server_type = buf.rstrip(b"\x00 ")
-        if self.convert_header_text:
-            self.server_type = self.server_type.decode(
-                self.encoding or self.default_encoding
-            )
 
-        buf = self._read_bytes(
+        self.os_version = self._read_and_convert_header_text(
             const.os_version_number_offset + total_align, const.os_version_number_length
         )
-        self.os_version = buf.rstrip(b"\x00 ")
-        if self.convert_header_text:
-            self.os_version = self.os_version.decode(
-                self.encoding or self.default_encoding
-            )
 
-        buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length)
-        buf = buf.rstrip(b"\x00 ")
-        if len(buf) > 0:
-            self.os_name = buf.decode(self.encoding or self.default_encoding)
-        else:
-            buf = self._read_bytes(
+        self.os_name = self._read_and_convert_header_text(
+            const.os_name_offset + total_align, const.os_name_length
+        )
+        if not self.os_name:
+            self.os_name = self._read_and_convert_header_text(
                 const.os_maker_offset + total_align, const.os_maker_length
             )
-            self.os_name = buf.rstrip(b"\x00 ")
-            if self.convert_header_text:
-                self.os_name = self.os_name.decode(
-                    self.encoding or self.default_encoding
-                )
 
     def __next__(self):
         da = self.read(nrows=self.chunksize or 1)
@@ -402,6 +377,11 @@ def _read_bytes(self, offset: int, length: int):
                 raise ValueError("The cached page is too small.")
             return self._cached_page[offset : offset + length]
 
+    def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
+        return self._convert_header_text(
+            self._read_bytes(offset, length).rstrip(b"\x00 ")
+        )
+
     def _parse_metadata(self) -> None:
         done = False
         while not done:
@@ -576,12 +556,9 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
 
         buf = self._read_bytes(offset, text_block_size)
         cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
-        cname = cname_raw
-        if self.convert_header_text:
-            cname = cname.decode(self.encoding or self.default_encoding)
-        self.column_names_strings.append(cname)
+        self.column_names_raw.append(cname_raw)
 
-        if len(self.column_names_strings) == 1:
+        if len(self.column_names_raw) == 1:
             compression_literal = b""
             for cl in const.compression_literals:
                 if cl in cname_raw:
@@ -615,11 +592,8 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
                     offset1 += 4
                 buf = self._read_bytes(offset1, self._lcs)
                 self.creator_proc = buf[0 : self._lcp]
-            if self.convert_header_text:
-                if hasattr(self, "creator_proc"):
-                    self.creator_proc = self.creator_proc.decode(
-                        self.encoding or self.default_encoding
-                    )
+            if hasattr(self, "creator_proc"):
+                self.creator_proc = self._convert_header_text(self.creator_proc)
 
     def _process_columnname_subheader(self, offset: int, length: int) -> None:
         int_len = self._int_length
@@ -650,8 +624,9 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
             )
             col_len = self._read_int(col_name_length, const.column_name_length_length)
 
-            name_str = self.column_names_strings[idx]
-            self.column_names.append(name_str[col_offset : col_offset + col_len])
+            name_raw = self.column_names_raw[idx]
+            cname = name_raw[col_offset : col_offset + col_len]
+            self.column_names.append(self._convert_header_text(cname))
 
     def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
         int_len = self._int_length
@@ -699,7 +674,7 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
         x = self._read_int(
             text_subheader_format, const.column_format_text_subheader_index_length
         )
-        format_idx = min(x, len(self.column_names_strings) - 1)
+        format_idx = min(x, len(self.column_names_raw) - 1)
 
         format_start = self._read_int(
             col_format_offset, const.column_format_offset_length
@@ -709,15 +684,19 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
         label_idx = self._read_int(
             text_subheader_label, const.column_label_text_subheader_index_length
         )
-        label_idx = min(label_idx, len(self.column_names_strings) - 1)
+        label_idx = min(label_idx, len(self.column_names_raw) - 1)
 
         label_start = self._read_int(col_label_offset, const.column_label_offset_length)
         label_len = self._read_int(col_label_len, const.column_label_length_length)
 
-        label_names = self.column_names_strings[label_idx]
-        column_label = label_names[label_start : label_start + label_len]
-        format_names = self.column_names_strings[format_idx]
-        column_format = format_names[format_start : format_start + format_len]
+        label_names = self.column_names_raw[label_idx]
+        column_label = self._convert_header_text(
+            label_names[label_start : label_start + label_len]
+        )
+        format_names = self.column_names_raw[format_idx]
+        column_format = self._convert_header_text(
+            format_names[format_start : format_start + format_len]
+        )
         current_column_number = len(self.columns)
 
         col = _Column(
@@ -815,9 +794,7 @@ def _chunk_to_dataframe(self) -> DataFrame:
             elif self._column_types[j] == b"s":
                 rslt[name] = pd.Series(self._string_chunk[js, :], index=ix)
                 if self.convert_text and (self.encoding is not None):
-                    rslt[name] = rslt[name].str.decode(
-                        self.encoding or self.default_encoding
-                    )
+                    rslt[name] = self._decode_string(rslt[name].str)
                 if self.blank_missing:
                     ii = rslt[name].str.len() == 0
                     rslt[name][ii] = np.nan
@@ -828,3 +805,12 @@ def _chunk_to_dataframe(self) -> DataFrame:
 
         df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False)
         return df
+
+    def _decode_string(self, b):
+        return b.decode(self.encoding or self.default_encoding)
+
+    def _convert_header_text(self, b: bytes) -> str | bytes:
+        if self.convert_header_text:
+            return self._decode_string(b)
+        else:
+            return b
diff --git a/pandas/tests/io/sas/data/0x40controlbyte.csv b/pandas/tests/io/sas/data/0x40controlbyte.csv
new file mode 100644
index 0000000000000..e81f5cc3904b7
--- /dev/null
+++ b/pandas/tests/io/sas/data/0x40controlbyte.csv
@@ -0,0 +1,2 @@
+long_string_field1,long_string_field2,long_string_field3
+00000000000000000000000000000000000000000000000000,11111111111111111111111111111111111111111111111111,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
diff --git a/pandas/tests/io/sas/data/0x40controlbyte.sas7bdat b/pandas/tests/io/sas/data/0x40controlbyte.sas7bdat
new file mode 100644
index 0000000000000..013542e282e2f
Binary files /dev/null and b/pandas/tests/io/sas/data/0x40controlbyte.sas7bdat differ
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
index 3f150c1a061ee..9724fcac815b5 100644
--- a/pandas/tests/io/sas/test_sas7bdat.py
+++ b/pandas/tests/io/sas/test_sas7bdat.py
@@ -381,3 +381,12 @@ def test_exception_propagation_rle_decompress(tmp_path, datapath):
     tmp_file.write_bytes(data)
     with pytest.raises(ValueError, match="unknown control byte"):
         pd.read_sas(tmp_file)
+
+
+def test_0x40_control_byte(datapath):
+    # GH 31243
+    fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
+    df = pd.read_sas(fname, encoding="ascii")
+    fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
+    df0 = pd.read_csv(fname, dtype="object")
+    tm.assert_frame_equal(df, df0)