diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
index 3c7a80f096844..dce505729b3ea 100644
--- a/doc/source/whatsnew/v1.6.0.rst
+++ b/doc/source/whatsnew/v1.6.0.rst
@@ -149,7 +149,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
 - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
 - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
-- Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`)
+- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/sas/_byteswap.pyi b/pandas/io/sas/_byteswap.pyi
new file mode 100644
index 0000000000000..bb0dbfc6a50b1
--- /dev/null
+++ b/pandas/io/sas/_byteswap.pyi
@@ -0,0 +1,5 @@
+def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
+def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
+def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
+def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
+def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
diff --git a/pandas/io/sas/byteswap.pyx b/pandas/io/sas/byteswap.pyx
new file mode 100644
index 0000000000000..4620403910274
--- /dev/null
+++ b/pandas/io/sas/byteswap.pyx
@@ -0,0 +1,106 @@
+"""
+The following are faster versions of struct.unpack that avoid the overhead of Python function calls.
+
+In the SAS7BDAT parser, they may be called up to (n_rows * n_cols) times.
+
+Each reader validates ``offset`` with a real check rather than an ``assert``,
+because asserts are skipped under ``python -O`` and an unchecked offset would
+read outside the buffer.
+"""
+from cython cimport Py_ssize_t
+from libc.stdint cimport (
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
+
+
+def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    """Read the 4-byte float at ``offset`` in ``data``, byteswapping on request."""
+    if offset < 0 or offset + 4 > len(data):
+        raise ValueError(f"cannot read 4 bytes at offset {offset}: out of bounds")
+    cdef:
+        const char *data_ptr = data
+        float res = (<float *>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap_float(res)
+    return res
+
+
+def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    """Read the 8-byte double at ``offset`` in ``data``, byteswapping on request."""
+    if offset < 0 or offset + 8 > len(data):
+        raise ValueError(f"cannot read 8 bytes at offset {offset}: out of bounds")
+    cdef:
+        const char *data_ptr = data
+        double res = (<double *>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap_double(res)
+    return res
+
+
+def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    """Read the 2-byte unsigned int at ``offset`` in ``data``, byteswapping on request."""
+    if offset < 0 or offset + 2 > len(data):
+        raise ValueError(f"cannot read 2 bytes at offset {offset}: out of bounds")
+    cdef:
+        const char *data_ptr = data
+        uint16_t res = (<uint16_t *>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap2(res)
+    return res
+
+
+def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    """Read the 4-byte unsigned int at ``offset`` in ``data``, byteswapping on request."""
+    if offset < 0 or offset + 4 > len(data):
+        raise ValueError(f"cannot read 4 bytes at offset {offset}: out of bounds")
+    cdef:
+        const char *data_ptr = data
+        uint32_t res = (<uint32_t *>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap4(res)
+    return res
+
+
+def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    """Read the 8-byte unsigned int at ``offset`` in ``data``, byteswapping on request."""
+    if offset < 0 or offset + 8 > len(data):
+        raise ValueError(f"cannot read 8 bytes at offset {offset}: out of bounds")
+    cdef:
+        const char *data_ptr = data
+        uint64_t res = (<uint64_t *>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap8(res)
+    return res
+
+
+# Byteswapping
+
+cdef extern from *:
+    """
+    #ifdef _MSC_VER
+    #define _byteswap2 _byteswap_ushort
+    #define _byteswap4 _byteswap_ulong
+    #define _byteswap8 _byteswap_uint64
+    #else
+    #define _byteswap2 __builtin_bswap16
+    #define _byteswap4 __builtin_bswap32
+    #define _byteswap8 __builtin_bswap64
+    #endif
+    """
+    uint16_t _byteswap2(uint16_t)
+    uint32_t _byteswap4(uint32_t)
+    uint64_t _byteswap8(uint64_t)
+
+
+cdef inline float _byteswap_float(float num):
+    cdef uint32_t *intptr = <uint32_t *>&num
+    intptr[0] = _byteswap4(intptr[0])
+    return num
+
+
+cdef inline double _byteswap_double(double num):
+    cdef uint64_t *intptr = <uint64_t *>&num
+    intptr[0] = _byteswap8(intptr[0])
+    return num
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index 9f16e0def0882..a60c1eb025218 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -20,7 +20,7 @@
     datetime,
     timedelta,
 )
-import struct
+import sys
 from typing import cast
 
 import numpy as np
@@ -42,6 +42,13 @@
 )
 
 from pandas.io.common import get_handle
+from pandas.io.sas._byteswap import (
+    read_double_with_byteswap,
+    read_float_with_byteswap,
+    read_uint16_with_byteswap,
+    read_uint32_with_byteswap,
+    read_uint64_with_byteswap,
+)
 from pandas.io.sas._sas import (
     Parser,
     get_subheader_index,
@@ -263,8 +270,10 @@ def _get_properties(self) -> None:
         buf = self._read_bytes(const.endianness_offset, const.endianness_length)
         if buf == b"\x01":
             self.byte_order = "<"
+            self.need_byteswap = sys.byteorder == "big"
         else:
             self.byte_order = ">"
+            self.need_byteswap = sys.byteorder == "little"
 
         # Get encoding information
         buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
@@ -286,7 +295,7 @@ def _get_properties(self) -> None:
         )
         self.date_modified = epoch + pd.to_timedelta(x, unit="s")
 
-        self.header_length = self._read_int(
+        self.header_length = self._read_uint(
             const.header_size_offset + align1, const.header_size_length
         )
 
@@ -298,7 +307,7 @@ def _get_properties(self) -> None:
         if len(self._cached_page) != self.header_length:  # type: ignore[arg-type]
             raise ValueError("The SAS7BDAT file appears to be truncated.")
 
-        self._page_length = self._read_int(
+        self._page_length = self._read_uint(
             const.page_size_offset + align1, const.page_size_length
         )
 
@@ -311,37 +320,46 @@ def __next__(self) -> DataFrame:
 
     # Read a single float of the given width (4 or 8).
     def _read_float(self, offset: int, width: int):
-        if width not in (4, 8):
+        assert self._cached_page is not None
+        if width == 4:
+            return read_float_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        elif width == 8:
+            return read_double_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        else:
             self.close()
             raise ValueError("invalid float width")
-        buf = self._read_bytes(offset, width)
-        fd = "f" if width == 4 else "d"
-        return struct.unpack(self.byte_order + fd, buf)[0]
 
-    # Read a single signed integer of the given width (1, 2, 4 or 8).
-    def _read_int(self, offset: int, width: int) -> int:
-        if width not in (1, 2, 4, 8):
+    # Read a single unsigned integer of the given width (1, 2, 4 or 8).
+    def _read_uint(self, offset: int, width: int) -> int:
+        assert self._cached_page is not None
+        if width == 1:
+            return self._read_bytes(offset, 1)[0]
+        elif width == 2:
+            return read_uint16_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        elif width == 4:
+            return read_uint32_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        elif width == 8:
+            return read_uint64_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        else:
             self.close()
             raise ValueError("invalid int width")
-        buf = self._read_bytes(offset, width)
-        it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
-        iv = struct.unpack(self.byte_order + it, buf)[0]
-        return iv
 
     def _read_bytes(self, offset: int, length: int):
-        if self._cached_page is None:
-            self._path_or_buf.seek(offset)
-            buf = self._path_or_buf.read(length)
-            if len(buf) < length:
-                self.close()
-                msg = f"Unable to read {length:d} bytes from file position {offset:d}."
-                raise ValueError(msg)
-            return buf
-        else:
-            if offset + length > len(self._cached_page):
-                self.close()
-                raise ValueError("The cached page is too small.")
-            return self._cached_page[offset : offset + length]
+        assert self._cached_page is not None
+        if offset + length > len(self._cached_page):
+            self.close()
+            raise ValueError("The cached page is too small.")
+        return self._cached_page[offset : offset + length]
 
     def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
         return self._convert_header_text(
@@ -375,12 +393,12 @@ def _read_page_header(self) -> None:
         bit_offset = self._page_bit_offset
         tx = const.page_type_offset + bit_offset
         self._current_page_type = (
-            self._read_int(tx, const.page_type_length) & const.page_type_mask2
+            self._read_uint(tx, const.page_type_length) & const.page_type_mask2
         )
         tx = const.block_count_offset + bit_offset
-        self._current_page_block_count = self._read_int(tx, const.block_count_length)
+        self._current_page_block_count = self._read_uint(tx, const.block_count_length)
         tx = const.subheader_count_offset + bit_offset
-        self._current_page_subheaders_count = self._read_int(
+        self._current_page_subheaders_count = self._read_uint(
             tx, const.subheader_count_length
         )
 
@@ -391,16 +409,16 @@ def _process_page_metadata(self) -> None:
             offset = const.subheader_pointers_offset + bit_offset
             total_offset = offset + self._subheader_pointer_length * i
 
-            subheader_offset = self._read_int(total_offset, self._int_length)
+            subheader_offset = self._read_uint(total_offset, self._int_length)
             total_offset += self._int_length
 
-            subheader_length = self._read_int(total_offset, self._int_length)
+            subheader_length = self._read_uint(total_offset, self._int_length)
             total_offset += self._int_length
 
-            subheader_compression = self._read_int(total_offset, 1)
+            subheader_compression = self._read_uint(total_offset, 1)
             total_offset += 1
 
-            subheader_type = self._read_int(total_offset, 1)
+            subheader_type = self._read_uint(total_offset, 1)
 
             if (
                 subheader_length == 0
@@ -442,29 +460,29 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None:
             lcs_offset += 354
             lcp_offset += 378
 
-        self.row_length = self._read_int(
+        self.row_length = self._read_uint(
             offset + const.row_length_offset_multiplier * int_len,
             int_len,
         )
-        self.row_count = self._read_int(
+        self.row_count = self._read_uint(
             offset + const.row_count_offset_multiplier * int_len,
             int_len,
         )
-        self.col_count_p1 = self._read_int(
+        self.col_count_p1 = self._read_uint(
             offset + const.col_count_p1_multiplier * int_len, int_len
         )
-        self.col_count_p2 = self._read_int(
+        self.col_count_p2 = self._read_uint(
             offset + const.col_count_p2_multiplier * int_len, int_len
         )
         mx = const.row_count_on_mix_page_offset_multiplier * int_len
-        self._mix_page_row_count = self._read_int(offset + mx, int_len)
-        self._lcs = self._read_int(lcs_offset, 2)
-        self._lcp = self._read_int(lcp_offset, 2)
+        self._mix_page_row_count = self._read_uint(offset + mx, int_len)
+        self._lcs = self._read_uint(lcs_offset, 2)
+        self._lcp = self._read_uint(lcp_offset, 2)
 
     def _process_columnsize_subheader(self, offset: int, length: int) -> None:
         int_len = self._int_length
         offset += int_len
-        self.column_count = self._read_int(offset, int_len)
+        self.column_count = self._read_uint(offset, int_len)
         if self.col_count_p1 + self.col_count_p2 != self.column_count:
             print(
                 f"Warning: column count mismatch ({self.col_count_p1} + "
@@ -478,7 +496,7 @@ def _process_subheader_counts(self, offset: int, length: int) -> None:
     def _process_columntext_subheader(self, offset: int, length: int) -> None:
 
         offset += self._int_length
-        text_block_size = self._read_int(offset, const.text_block_size_length)
+        text_block_size = self._read_uint(offset, const.text_block_size_length)
 
         buf = self._read_bytes(offset, text_block_size)
         cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
@@ -542,13 +560,13 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
                 + const.column_name_length_offset
             )
 
-            idx = self._read_int(
+            idx = self._read_uint(
                 text_subheader, const.column_name_text_subheader_length
             )
-            col_offset = self._read_int(
+            col_offset = self._read_uint(
                 col_name_offset, const.column_name_offset_length
             )
-            col_len = self._read_int(col_name_length, const.column_name_length_length)
+            col_len = self._read_uint(col_name_length, const.column_name_length_length)
 
             name_raw = self.column_names_raw[idx]
             cname = name_raw[col_offset : col_offset + col_len]
@@ -571,13 +589,13 @@ def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
                 offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
             )
 
-            x = self._read_int(col_data_offset, int_len)
+            x = self._read_uint(col_data_offset, int_len)
             self._column_data_offsets.append(x)
 
-            x = self._read_int(col_data_len, const.column_data_length_length)
+            x = self._read_uint(col_data_len, const.column_data_length_length)
             self._column_data_lengths.append(x)
 
-            x = self._read_int(col_types, const.column_type_length)
+            x = self._read_uint(col_types, const.column_type_length)
             self._column_types.append(b"d" if x == 1 else b"s")
 
     def _process_columnlist_subheader(self, offset: int, length: int) -> None:
@@ -597,23 +615,25 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
         col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
         col_label_len = offset + const.column_label_length_offset + 3 * int_len
 
-        x = self._read_int(
+        x = self._read_uint(
             text_subheader_format, const.column_format_text_subheader_index_length
         )
         format_idx = min(x, len(self.column_names_raw) - 1)
 
-        format_start = self._read_int(
+        format_start = self._read_uint(
             col_format_offset, const.column_format_offset_length
         )
-        format_len = self._read_int(col_format_len, const.column_format_length_length)
+        format_len = self._read_uint(col_format_len, const.column_format_length_length)
 
-        label_idx = self._read_int(
+        label_idx = self._read_uint(
             text_subheader_label, const.column_label_text_subheader_index_length
         )
         label_idx = min(label_idx, len(self.column_names_raw) - 1)
 
-        label_start = self._read_int(col_label_offset, const.column_label_offset_length)
-        label_len = self._read_int(col_label_len, const.column_label_length_length)
+        label_start = self._read_uint(
+            col_label_offset, const.column_label_offset_length
+        )
+        label_len = self._read_uint(col_label_len, const.column_label_length_length)
 
         label_names = self.column_names_raw[label_idx]
         column_label = self._convert_header_text(
diff --git a/pandas/tests/io/sas/test_byteswap.py b/pandas/tests/io/sas/test_byteswap.py
new file mode 100644
index 0000000000000..2c88907df3b1d
--- /dev/null
+++ b/pandas/tests/io/sas/test_byteswap.py
@@ -0,0 +1,74 @@
+from hypothesis import (
+    assume,
+    example,
+    given,
+    strategies as st,
+)
+import numpy as np
+import pytest
+
+import pandas._testing as tm
+
+from pandas.io.sas._byteswap import (
+    read_double_with_byteswap,
+    read_float_with_byteswap,
+    read_uint16_with_byteswap,
+    read_uint32_with_byteswap,
+    read_uint64_with_byteswap,
+)
+
+
+@given(read_offset=st.integers(0, 12), number=st.integers(min_value=0))
+@example(number=2**16, read_offset=0)
+@example(number=2**32, read_offset=0)
+@example(number=2**64, read_offset=0)
+@pytest.mark.parametrize("int_type", [np.uint16, np.uint32, np.uint64])
+@pytest.mark.parametrize("should_byteswap", [True, False])
+def test_int_byteswap(read_offset, number, int_type, should_byteswap):
+    assume(number < 2 ** (8 * int_type(0).itemsize))
+    _test(number, int_type, read_offset, should_byteswap)
+
+
+@given(read_offset=st.integers(0, 12), number=st.floats())
+@pytest.mark.parametrize("float_type", [np.float32, np.float64])
+@pytest.mark.parametrize("should_byteswap", [True, False])
+def test_float_byteswap(read_offset, number, float_type, should_byteswap):
+    _test(number, float_type, read_offset, should_byteswap)
+
+
+def _test(number, number_type, read_offset, should_byteswap):
+    number = number_type(number)
+    data = np.random.default_rng().integers(0, 256, size=20, dtype="uint8")
+    data[read_offset : read_offset + number.itemsize] = number[None].view("uint8")
+    swap_func = {
+        np.float32: read_float_with_byteswap,
+        np.float64: read_double_with_byteswap,
+        np.uint16: read_uint16_with_byteswap,
+        np.uint32: read_uint32_with_byteswap,
+        np.uint64: read_uint64_with_byteswap,
+    }[type(number)]
+    output_number = number_type(swap_func(bytes(data), read_offset, should_byteswap))
+    if should_byteswap:
+        tm.assert_equal(output_number, number.byteswap())
+    else:
+        tm.assert_equal(output_number, number)
+
+
+@pytest.mark.parametrize(
+    "read_func, width",
+    [
+        (read_uint16_with_byteswap, 2),
+        (read_uint32_with_byteswap, 4),
+        (read_uint64_with_byteswap, 8),
+        (read_float_with_byteswap, 4),
+        (read_double_with_byteswap, 8),
+    ],
+)
+def test_read_bounds(read_func, width):
+    data = bytes(16)
+    # A value ending exactly at the end of the buffer is readable ...
+    assert read_func(data, len(data) - width, False) == 0
+    # ... but one starting before or running past the buffer is rejected.
+    for bad_offset in (-1, len(data) - width + 1):
+        with pytest.raises(ValueError, match="out of bounds"):
+            read_func(data, bad_offset, False)
diff --git a/setup.py b/setup.py
index a6691ae6f1047..0e489c4c9b017 100755
--- a/setup.py
+++ b/setup.py
@@ -226,6 +226,7 @@ class CheckSDist(sdist_class):
         "pandas/_libs/window/indexers.pyx",
         "pandas/_libs/writers.pyx",
         "pandas/io/sas/sas.pyx",
+        "pandas/io/sas/byteswap.pyx",
     ]
 
     _cpp_pyxfiles = [
@@ -571,6 +572,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
     "_libs.window.indexers": {"pyxfile": "_libs/window/indexers"},
     "_libs.writers": {"pyxfile": "_libs/writers"},
     "io.sas._sas": {"pyxfile": "io/sas/sas"},
+    "io.sas._byteswap": {"pyxfile": "io/sas/byteswap"},
 }
 
 extensions = []