
PERF: HDFStore table writing performance improvements #3537


Merged: 1 commit, May 8, 2013
1 change: 1 addition & 0 deletions RELEASE.rst
@@ -44,6 +44,7 @@ pandas 0.11.1
  - will warn with a FrequencyWarning if you are attempting to append
    an index with a different frequency than the existing
  - support datelike columns with a timezone as data_columns (GH2852_)
+ - table writing performance improvements.

**API Changes**

25 changes: 18 additions & 7 deletions pandas/io/pytables.py
@@ -913,7 +913,7 @@ def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
        self.stop = min(self.nrows,stop)

        if chunksize is None:
-            chunksize = 50000
+            chunksize = 100000

        self.chunksize = chunksize

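For context, this default feeds TableIterator, which drives chunked reads through HDFStore.select; with no explicit chunksize, each iteration now pulls 100,000 rows instead of 50,000. A minimal sketch using the public API, assuming a file test.h5 that already holds an appendable table named 'df':

    import pandas as pd

    # assumes 'test.h5' contains an appendable table named 'df'
    store = pd.HDFStore('test.h5')
    nrows = 0
    for chunk in store.select('df', chunksize=None, iterator=True):
        # chunksize=None now means 100,000 rows per chunk (was 50,000)
        nrows += len(chunk)
    store.close()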
@@ -2232,6 +2232,10 @@ def table(self):
""" return the table group (this is my storable) """
return self.storable

@property
def dtype(self):
return self.table.dtype

@property
def description(self):
return self.table.description
@@ -2848,7 +2852,7 @@ class AppendableTable(LegacyTable):
    table_type = 'appendable'

    def write(self, obj, axes=None, append=False, complib=None,
-              complevel=None, fletcher32=None, min_itemsize=None, chunksize=50000,
+              complevel=None, fletcher32=None, min_itemsize=None, chunksize=None,
               expectedrows=None, **kwargs):

        if not append and self.is_exists:
@@ -2905,18 +2909,26 @@ def write_data(self, chunksize):
            [a.is_searchable for a in self.values_axes]).astype('u1')
        values = [a.take_data() for a in self.values_axes]

+        # transpose the values so first dimension is last
+        values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ]
+
        # write the chunks
+        if chunksize is None:
+            chunksize = 100000
+
        rows = self.nrows_expected
        chunks = int(rows / chunksize) + 1
        for i in xrange(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, rows)
+            if start_i >= end_i:
+                break

            self.write_data_chunk(
                indexes=[a[start_i:end_i] for a in indexes],
                mask=mask[start_i:end_i],
                search=search,
-                values=[v[:, start_i:end_i] for v in values])
+                values=[v[start_i:end_i] for v in values])

    def write_data_chunk(self, indexes, mask, search, values):

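Because the values blocks are transposed up front, every array in indexes and values is row-aligned on axis 0 and the loop can slice them all uniformly. A standalone sketch of just the slicing arithmetic (iter_write_chunks is an invented name, not part of the patch):

    def iter_write_chunks(nrows, chunksize=100000):
        # yield (start, end) row slices of at most `chunksize` rows,
        # mirroring the chunk loop in write_data above
        for i in range(int(nrows / chunksize) + 1):
            start = i * chunksize
            end = min((i + 1) * chunksize, nrows)
            if start >= end:  # nothing left after the final partial chunk
                break
            yield start, end

    # 250,000 rows -> (0, 100000), (100000, 200000), (200000, 250000)
    print(list(iter_write_chunks(250000)))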
@@ -2929,7 +2941,7 @@ def write_data_chunk(self, indexes, mask, search, values):
        try:
            func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
            args = list(indexes)
-            args.extend([mask, search, values])
+            args.extend([self.dtype, mask, search, values])
            rows = func(*args)
        except (Exception), detail:
            raise Exception("cannot create row-data -> %s" % str(detail))
@@ -2939,9 +2951,8 @@ def write_data_chunk(self, indexes, mask, search, values):
                self.table.append(rows)
                self.table.flush()
            except (Exception), detail:
-                raise Exception(
-                    "tables cannot write this data -> %s" % str(detail))
+                raise Exception("tables cannot write this data -> %s" % str(detail))

    def delete(self, where=None, **kwargs):

        # delete all rows (and return the nrows)
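The core of the speedup is in what write_data_chunk now hands to PyTables: a NumPy structured array built against the table's own dtype, rather than a Python list of tuples that PyTables must convert row by row. A minimal standalone illustration with an invented two-field dtype, using the modern PyTables API (not the patch itself):

    import numpy as np
    import tables

    # a structured dtype of the kind self.dtype returns for an existing table
    dt = np.dtype([('index', 'i8'), ('values_block_0', 'f8', (2,))])

    rows = np.empty(3, dtype=dt)
    rows[0] = (0, [1.0, 2.0])
    rows[1] = (1, [3.0, 4.0])
    rows[2] = (2, [5.0, 6.0])

    with tables.open_file('demo.h5', 'w') as h5:
        t = h5.create_table('/', 'tbl', description=dt)
        t.append(rows)   # a matching record array is appended wholesale
        t.flush()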
106 changes: 59 additions & 47 deletions pandas/lib.pyx
@@ -837,61 +837,70 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr

@cython.boundscheck(False)
@cython.wraparound(False)
def create_hdf_rows_2d(ndarray indexer0,
+                      object dtype,
                       ndarray[np.uint8_t, ndim=1] mask,
                       ndarray[np.uint8_t, ndim=1] searchable,
                       list values):
    """ return a list of objects ready to be converted to rec-array format """

    cdef:
-        int i, b, n_indexer0, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, l, b, n_indexer0, n_blocks, tup_size
+        ndarray result
+        tuple tup
+        object v

    n_indexer0 = indexer0.shape[0]
    n_blocks = len(values)
    tup_size = n_blocks+1
-    l = []
-    for i from 0 <= i < n_indexer0:
+    result = np.empty(n_indexer0,dtype=dtype)
+    l = 0
+    for i in range(n_indexer0):

        if not mask[i]:

            tup = PyTuple_New(tup_size)
-            val = indexer0[i]
-            PyTuple_SET_ITEM(tup, 0, val)
-            Py_INCREF(val)
-
-            for b from 0 <= b < n_blocks:
+            v = indexer0[i]
+            PyTuple_SET_ITEM(tup, 0, v)
+            Py_INCREF(v)
+
+            for b in range(n_blocks):

-                v = values[b][:, i]
+                v = values[b][i]
                if searchable[b]:
                    v = v[0]

                PyTuple_SET_ITEM(tup, b+1, v)
                Py_INCREF(v)

-            l.append(tup)
+            result[l] = tup
+            l += 1

-    return l
+    return result[0:l]
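The strategy change is easier to see without the C-API calls: pre-allocate the record array for the worst case, fill it only for unmasked rows, and trim. A rough pure-Python rendering of the 2-D routine (a sketch, not the compiled code):

    import numpy as np

    def create_rows_2d(indexer0, dtype, mask, searchable, values):
        n = len(indexer0)
        result = np.empty(n, dtype=dtype)  # pre-allocated, not a growing list
        l = 0
        for i in range(n):
            if mask[i]:                    # masked rows are skipped entirely
                continue
            row = [indexer0[i]]
            for b, block in enumerate(values):
                v = block[i]
                if searchable[b]:          # searchable (one-column) blocks store a scalar
                    v = v[0]
                row.append(v)
            result[l] = tuple(row)
            l += 1
        return result[:l]                  # drop slots left unused by the mask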

@cython.boundscheck(False)
@cython.wraparound(False)
def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
+                      object dtype,
                       ndarray[np.uint8_t, ndim=2] mask,
                       ndarray[np.uint8_t, ndim=1] searchable,
                       list values):
    """ return a list of objects ready to be converted to rec-array format """

    cdef:
-        int i, j, b, n_indexer0, n_indexer1, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, j, l, b, n_indexer0, n_indexer1, n_blocks, tup_size
+        tuple tup
+        object v
+        ndarray result

    n_indexer0 = indexer0.shape[0]
    n_indexer1 = indexer1.shape[0]
    n_blocks = len(values)
    tup_size = n_blocks+2
-    l = []
+    result = np.empty(n_indexer0*n_indexer1,dtype=dtype)
+    l = 0
    for i from 0 <= i < n_indexer0:

        for j from 0 <= j < n_indexer1:
@@ -900,45 +909,49 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,

                tup = PyTuple_New(tup_size)

-                val = indexer0[i]
-                PyTuple_SET_ITEM(tup, 0, val)
-                Py_INCREF(val)
-
-                val = indexer1[j]
-                PyTuple_SET_ITEM(tup, 1, val)
-                Py_INCREF(val)
+                v = indexer0[i]
+                PyTuple_SET_ITEM(tup, 0, v)
+                Py_INCREF(v)
+                v = indexer1[j]
+                PyTuple_SET_ITEM(tup, 1, v)
+                Py_INCREF(v)

                for b from 0 <= b < n_blocks:

-                    v = values[b][:, i, j]
+                    v = values[b][i, j]
                    if searchable[b]:
                        v = v[0]

                    PyTuple_SET_ITEM(tup, b+2, v)
                    Py_INCREF(v)

-                l.append(tup)
+                result[l] = tup
+                l += 1

-    return l
+    return result[0:l]

@cython.boundscheck(False)
@cython.wraparound(False)
def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
+                      object dtype,
                       ndarray[np.uint8_t, ndim=3] mask,
                       ndarray[np.uint8_t, ndim=1] searchable,
                       list values):
    """ return a list of objects ready to be converted to rec-array format """

    cdef:
-        int i, j, k, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, j, k, l, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
+        tuple tup
+        object v
+        ndarray result

    n_indexer0 = indexer0.shape[0]
    n_indexer1 = indexer1.shape[0]
    n_indexer2 = indexer2.shape[0]
    n_blocks = len(values)
    tup_size = n_blocks+3
-    l = []
+    result = np.empty(n_indexer0*n_indexer1*n_indexer2,dtype=dtype)
+    l = 0
    for i from 0 <= i < n_indexer0:

        for j from 0 <= j < n_indexer1:
@@ -949,29 +962,28 @@ def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,

                    tup = PyTuple_New(tup_size)

-                    val = indexer0[i]
-                    PyTuple_SET_ITEM(tup, 0, val)
-                    Py_INCREF(val)
-
-                    val = indexer1[j]
-                    PyTuple_SET_ITEM(tup, 1, val)
-                    Py_INCREF(val)
-
-                    val = indexer2[k]
-                    PyTuple_SET_ITEM(tup, 2, val)
-                    Py_INCREF(val)
+                    v = indexer0[i]
+                    PyTuple_SET_ITEM(tup, 0, v)
+                    Py_INCREF(v)
+                    v = indexer1[j]
+                    PyTuple_SET_ITEM(tup, 1, v)
+                    Py_INCREF(v)
+                    v = indexer2[k]
+                    PyTuple_SET_ITEM(tup, 2, v)
+                    Py_INCREF(v)

                    for b from 0 <= b < n_blocks:

-                        v = values[b][:, i, j, k]
+                        v = values[b][i, j, k]
                        if searchable[b]:
                            v = v[0]
                        PyTuple_SET_ITEM(tup, b+3, v)
                        Py_INCREF(v)

-                    l.append(tup)
+                    result[l] = tup
+                    l += 1

-    return l
+    return result[0:l]

#-------------------------------------------------------------------------------
# Groupby-related functions
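None of this changes the public interface; appends simply get faster, and chunksize can still be overridden per call. A usage sketch (file name and shape invented for the example):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(1000000, 3), columns=list('ABC'))

    store = pd.HDFStore('test.h5', mode='w')
    # chunksize now defaults to None and falls back to 100,000 rows
    # inside write_data; an explicit value still works:
    store.append('df', df, chunksize=200000)
    store.close()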