Skip to content

Commit 66165db

Browse files
committed
Fast byteswap
1 parent 9c241fe commit 66165db

File tree

2 files changed

+114
-14
lines changed

2 files changed

+114
-14
lines changed

pandas/io/sas/sas.pyx

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
# cython: profile=False
22
# cython: boundscheck=False, initializedcheck=False
33
from cython cimport Py_ssize_t
4+
from libc.stdint cimport (
5+
int64_t,
6+
uint8_t,
7+
uint16_t,
8+
uint32_t,
9+
uint64_t,
10+
)
11+
from libc.string cimport memcpy
12+
413
import numpy as np
514

615
import pandas.io.sas.sas_constants as const
716

8-
ctypedef signed long long int64_t
9-
ctypedef unsigned char uint8_t
10-
ctypedef unsigned short uint16_t
1117

1218
# rle_decompress decompresses data using a Run Length Encoding
1319
# algorithm. It is partially documented here:
@@ -434,3 +440,73 @@ cdef class Parser:
434440
self.current_row_on_page_index += 1
435441
self.current_row_in_chunk_index += 1
436442
self.current_row_in_file_index += 1
443+
444+
445+
def read_float_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first four bytes at ``data`` as a single-precision float.

    Parameters
    ----------
    data : const uint8_t *
        Start of the encoded value. No length is available here, so the
        caller must pass at least 4 readable bytes.
    byteswap : bint
        When true, reverse the byte order before interpreting the value.

    Returns
    -------
    float
    """
    cdef:
        uint32_t bits
        float res

    # Copy rather than dereference ``<float*>data``: the buffer carries no
    # alignment guarantee for ``float``.
    memcpy(&bits, data, sizeof(bits))
    if byteswap:
        # Swap the integer image so a not-yet-swapped bit pattern is never
        # passed around as a floating-point value.
        bits = _byteswap4(bits)
    memcpy(&res, &bits, sizeof(res))
    return res
450+
451+
452+
def read_double_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first eight bytes at ``data`` as a double-precision float.

    Parameters
    ----------
    data : const uint8_t *
        Start of the encoded value. No length is available here, so the
        caller must pass at least 8 readable bytes.
    byteswap : bint
        When true, reverse the byte order before interpreting the value.

    Returns
    -------
    float
    """
    cdef:
        uint64_t bits
        double res

    # Copy rather than dereference ``<double*>data``: the buffer carries no
    # alignment guarantee for ``double``.
    memcpy(&bits, data, sizeof(bits))
    if byteswap:
        # Swap the integer image so a not-yet-swapped bit pattern is never
        # passed around as a floating-point value.
        bits = _byteswap8(bits)
    memcpy(&res, &bits, sizeof(res))
    return res
457+
458+
459+
def read_uint16_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first two bytes at ``data`` as an unsigned 16-bit integer.

    Parameters
    ----------
    data : const uint8_t *
        Start of the encoded value; the caller must pass at least 2
        readable bytes.
    byteswap : bint
        When true, reverse the byte order of the value that was read.

    Returns
    -------
    int
    """
    cdef uint16_t res

    # memcpy avoids an unaligned ``uint16_t`` load through a casted pointer.
    memcpy(&res, data, sizeof(res))
    if byteswap:
        res = _byteswap2(res)
    return res
464+
465+
466+
def read_uint32_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first four bytes at ``data`` as an unsigned 32-bit integer.

    Parameters
    ----------
    data : const uint8_t *
        Start of the encoded value; the caller must pass at least 4
        readable bytes.
    byteswap : bint
        When true, reverse the byte order of the value that was read.

    Returns
    -------
    int
    """
    cdef uint32_t res

    # memcpy avoids an unaligned ``uint32_t`` load through a casted pointer.
    memcpy(&res, data, sizeof(res))
    if byteswap:
        res = _byteswap4(res)
    return res
471+
472+
473+
def read_uint64_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first eight bytes at ``data`` as an unsigned 64-bit integer.

    Parameters
    ----------
    data : const uint8_t *
        Start of the encoded value; the caller must pass at least 8
        readable bytes.
    byteswap : bint
        When true, reverse the byte order of the value that was read.

    Returns
    -------
    int
    """
    cdef uint64_t res

    # memcpy avoids an unaligned ``uint64_t`` load through a casted pointer.
    memcpy(&res, data, sizeof(res))
    if byteswap:
        res = _byteswap8(res)
    return res
478+
479+
480+
# Byteswapping
481+
# From https://github.com/WizardMac/ReadStat/blob/master/src/readstat_bits.c
482+
# Copyright (c) 2013-2016 Evan Miller, Apache 2 License
483+
484+
cdef inline uint16_t _byteswap2(uint16_t num):
    # Exchange the two bytes of a 16-bit value.
    cdef uint16_t high_to_low = (num >> 8) & 0x00FF
    cdef uint16_t low_to_high = (num & 0x00FF) << 8
    return low_to_high | high_to_low
486+
487+
488+
cdef inline uint32_t _byteswap4(uint32_t num):
    # Reverse four bytes: swap the 16-bit halves, then the bytes in each half.
    cdef uint32_t halves = (num >> 16) | (num << 16)
    return (
        ((halves >> 8) & <uint32_t>0x00FF00FF)
        | ((halves << 8) & <uint32_t>0xFF00FF00)
    )
491+
492+
493+
cdef inline uint64_t _byteswap8(uint64_t num):
    # Reverse eight bytes in three passes: 32-bit halves, 16-bit pairs, bytes.
    cdef uint64_t halves = (num >> 32) | (num << 32)
    cdef uint64_t pairs = (
        ((halves >> 16) & <uint64_t>0x0000FFFF0000FFFF)
        | ((halves << 16) & <uint64_t>0xFFFF0000FFFF0000)
    )
    return (
        ((pairs >> 8) & <uint64_t>0x00FF00FF00FF00FF)
        | ((pairs << 8) & <uint64_t>0xFF00FF00FF00FF00)
    )
497+
498+
499+
cdef inline float _byteswap_float(float num):
    # Reverse the byte order of a float by round-tripping its bit pattern
    # through a 32-bit integer.
    cdef uint32_t bits
    memcpy(&bits, &num, sizeof(bits))
    bits = _byteswap4(bits)
    memcpy(&num, &bits, sizeof(num))
    return num
505+
506+
507+
cdef inline double _byteswap_double(double num):
    # Reverse the byte order of a double by round-tripping its bit pattern
    # through a 64-bit integer.
    cdef uint64_t bits
    memcpy(&bits, &num, sizeof(bits))
    bits = _byteswap8(bits)
    memcpy(&num, &bits, sizeof(num))
    return num

pandas/io/sas/sas7bdat.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
datetime,
2121
timedelta,
2222
)
23-
import struct
23+
import sys
2424
from typing import cast
2525

2626
import numpy as np
@@ -42,7 +42,14 @@
4242
)
4343

4444
from pandas.io.common import get_handle
45-
from pandas.io.sas._sas import Parser
45+
from pandas.io.sas._sas import (
46+
Parser,
47+
read_double_with_byteswap,
48+
read_float_with_byteswap,
49+
read_uint16_with_byteswap,
50+
read_uint32_with_byteswap,
51+
read_uint64_with_byteswap,
52+
)
4653
import pandas.io.sas.sas_constants as const
4754
from pandas.io.sas.sasreader import ReaderBase
4855

@@ -259,8 +266,10 @@ def _get_properties(self) -> None:
259266
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
260267
if buf == b"\x01":
261268
self.byte_order = "<"
269+
self.need_byteswap = sys.byteorder == "big"
262270
else:
263271
self.byte_order = ">"
272+
self.need_byteswap = sys.byteorder == "little"
264273

265274
# Get encoding information
266275
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
@@ -370,22 +379,37 @@ def __next__(self):
370379

371380
# Read a single float of the given width (4 or 8).
372381
def _read_float(self, offset: int, width: int):
    """
    Read one floating-point value of ``width`` bytes starting at ``offset``.

    The bytes come from ``self._read_bytes`` and are decoded by the compiled
    helpers, which byte-swap when ``self.need_byteswap`` is true. Any width
    other than 4 or 8 closes the reader and raises ``ValueError``.
    """
    if width == 4:
        size, decode = 4, read_float_with_byteswap
    elif width == 8:
        size, decode = 8, read_double_with_byteswap
    else:
        self.close()
        raise ValueError("invalid float width")
    return decode(self._read_bytes(offset, size), self.need_byteswap)
379393

380394
# Read a single unsigned integer of the given width (1, 2, 4 or 8).
def _read_int(self, offset: int, width: int) -> int:
    """
    Read one unsigned integer of ``width`` bytes starting at ``offset``.

    Widths 2, 4 and 8 are decoded by the compiled ``read_uint*`` helpers,
    which byte-swap when ``self.need_byteswap`` is true. The result is
    unsigned for those widths; the earlier ``struct``-based implementation
    used signed format codes. Any other width closes the reader and raises
    ``ValueError``.
    """
    if width == 1:
        # A single byte has no byte order. Indexing yields 0..255 provided
        # ``_read_bytes`` returns a bytes-like object (not visible here).
        return self._read_bytes(offset, 1)[0]

    if width == 2:
        size, decode = 2, read_uint16_with_byteswap
    elif width == 4:
        size, decode = 4, read_uint32_with_byteswap
    elif width == 8:
        size, decode = 8, read_uint64_with_byteswap
    else:
        self.close()
        raise ValueError("invalid int width")
    return decode(self._read_bytes(offset, size), self.need_byteswap)
389413

390414
def _read_bytes(self, offset: int, length: int):
391415
if self._cached_page is None:

0 commit comments

Comments
 (0)