From 29fec858e0f249789525c4e28c5da1e039cb675d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 1 May 2017 12:41:02 -0400 Subject: [PATCH 1/3] Add tocsr method and sorted checking pull out linear_loc to separate function --- sparse/core.py | 157 ++++++++++++++++++++++++++++---------- sparse/tests/test_core.py | 4 +- sparse/utils.py | 13 ++++ 3 files changed, 132 insertions(+), 42 deletions(-) diff --git a/sparse/core.py b/sparse/core.py index be9c06c7..0750e402 100644 --- a/sparse/core.py +++ b/sparse/core.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, print_function -from collections import Iterable +from collections import Iterable, defaultdict, deque from functools import reduce import numbers import operator @@ -9,6 +9,12 @@ import scipy.sparse +try: # Windows compatibility + int = long +except NameError: + pass + + class COO(object): """ A Sparse Multidimensional Array @@ -29,9 +35,9 @@ class COO(object): -------- >>> x = np.eye(4) >>> x[2, 3] = 5 - >>> s = COO.from_numpy(x) + >>> s = COO(x) >>> s - + >>> s.data array([ 1., 1., 1., 5., 1.]) >>> s.coords @@ -50,9 +56,9 @@ class COO(object): >>> data = [1, 2, 3, 4, 5] >>> y = COO(coords, data, shape=(3, 4, 5)) >>> y - + >>> tensordot(s, y, axes=(0, 1)) - + Following scipy.sparse conventions you can also pass these as a tuple with rows and columns @@ -73,7 +79,7 @@ class COO(object): >>> d = {(0, 0, 0): 1, (1, 2, 3): 2, (1, 1, 0): 3} >>> COO(d) - + >>> L = [((0, 0), 1), ... ((1, 1), 2), @@ -89,17 +95,21 @@ class COO(object): """ __array_priority__ = 12 - def __init__(self, coords, data=None, shape=None, has_duplicates=True): + def __init__(self, coords, data=None, shape=None, has_duplicates=True, + sorted=False): + self._cache = defaultdict(lambda: deque(maxlen=3)) if data is None: # {(i, j, k): x, (i, j, k): y, ...} if isinstance(coords, dict): coords = list(coords.items()) + has_duplicates = False if isinstance(coords, np.ndarray): result = COO.from_numpy(coords) self.coords = result.coords self.data = result.data self.has_duplicates = result.has_duplicates + self.sorted = result.sorted self.shape = result.shape return @@ -143,6 +153,7 @@ def __init__(self, coords, data=None, shape=None, has_duplicates=True): self.coords = self.coords.astype(dtype) assert not self.shape or len(data) == self.coords.shape[1] self.has_duplicates = has_duplicates + self.sorted = sorted @classmethod def from_numpy(cls, x): @@ -153,7 +164,8 @@ def from_numpy(cls, x): else: coords = [] data = x - return cls(coords, data, shape=x.shape) + return cls(coords, data, shape=x.shape, has_duplicates=False, + sorted=True) def todense(self): self = self.sum_duplicates() @@ -169,7 +181,9 @@ def from_scipy_sparse(cls, x): coords = np.empty((2, x.nnz), dtype=x.row.dtype) coords[0, :] = x.row coords[1, :] = x.col - return COO(coords, x.data, shape=x.shape, has_duplicates=not x.has_canonical_format) + return COO(coords, x.data, shape=x.shape, + has_duplicates=not x.has_canonical_format, + sorted=x.has_canonical_format) @property def dtype(self): @@ -235,11 +249,14 @@ def __getitem__(self, index): shape = tuple(shape) data = self.data[mask] - return COO(coords, data, shape=shape, has_duplicates=self.has_duplicates) + return COO(coords, data, shape=shape, + has_duplicates=self.has_duplicates, + sorted=self.sorted) def __str__(self): - return "" % (self.shape, self.dtype, - self.nnz) + return "" % ( + self.shape, self.dtype, self.nnz, self.sorted, + self.has_duplicates) __repr__ = __str__ @@ -272,6 +289,8 @@ def reduction(self, method, axis=None, keepdims=False, dtype=None): a = getattr(a, method)(axis=0, **kwargs) if isinstance(a, scipy.sparse.spmatrix): a = COO.from_scipy_sparse(a) + a.sorted = self.sorted + a.has_duplicates = False elif isinstance(a, np.matrix): a = np.asarray(a)[0] a = COO.from_numpy(a) @@ -328,22 +347,37 @@ def T(self): def dot(self, other): return dot(self, other) + def linear_loc(self, signed=False): + """ Index location of every piece of data in a flattened array + + This is used internally to check for duplicates, re-order, reshape, + etc.. + """ + n = reduce(operator.mul, self.shape) + if signed: + n = -n + dtype = np.min_scalar_type(n) + out = np.zeros(self.nnz, dtype=dtype) + tmp = np.zeros(self.nnz, dtype=dtype) + strides = 1 + for i, d in enumerate(self.shape[::-1]): + # out += self.coords[-(i + 1), :].astype(dtype) * strides + np.multiply(self.coords[-(i + 1), :], strides, out=tmp, dtype=dtype) + np.add(tmp, out, out=out) + strides *= d + return out + def reshape(self, shape): if self.shape == shape: return self if any(d == -1 for d in shape): - extra = np.uint64(np.prod(self.shape) / - np.prod([d for d in shape if d != -1])) + extra = int(np.prod(self.shape) / + np.prod([d for d in shape if d != -1])) shape = tuple([d if d != -1 else extra for d in shape]) if self.shape == shape: return self # TODO: this np.prod(self.shape) enforces a 2**64 limit to array size - dtype = np.min_scalar_type(np.prod(self.shape)) - linear_loc = np.zeros(self.nnz, dtype=dtype) - strides = 1 - for i, d in enumerate(self.shape[::-1]): - linear_loc += self.coords[-(i + 1), :].astype(dtype) * strides - strides *= d + linear_loc = self.linear_loc() coords = np.empty((len(shape), self.nnz), dtype=np.min_scalar_type(max(shape))) strides = 1 @@ -351,7 +385,9 @@ def reshape(self, shape): coords[-(i + 1), :] = (linear_loc // strides) % d strides *= d - return COO(coords, self.data, shape, has_duplicates=self.has_duplicates) + return COO(coords, self.data, shape, + has_duplicates=self.has_duplicates, + sorted=self.sorted) def to_scipy_sparse(self): assert self.ndim == 2 @@ -359,9 +395,25 @@ def to_scipy_sparse(self): (self.coords[0], self.coords[1])), shape=self.shape) - result.has_canonical_format = not self.has_duplicates + result.has_canonical_format = (not self.has_duplicates and self.sorted) return result + def _tocsr(self): + assert self.ndim == 2 + + # Pass 1: sum duplicates + self.sum_duplicates() + + # Pass 2: sort indices + self.sort_indices() + row, col = self.coords + + # Pass 3: count nonzeros in each row + indptr = np.zeros(self.shape[0] + 1, dtype=np.int64) + np.cumsum(np.bincount(row, minlength=self.shape[0]), out=indptr[1:]) + + return scipy.sparse.csr_matrix((self.data, col, indptr), shape=self.shape) + def tocsr(self): try: return self._csr @@ -373,10 +425,8 @@ def tocsr(self): except AttributeError: pass - coo = self.to_scipy_sparse() - csr = coo.tocsr() - self._csr = csr - return csr + self._csr = self._tocsr() + return self._csr def tocsc(self): try: @@ -388,10 +438,25 @@ def tocsc(self): return self._csc except AttributeError: pass - coo = self.to_scipy_sparse() - csc = coo.tocsc() - self._csc = csc - return csc + + self._csc = self.tocsr().tocsc() + return self._csc + + def sort_indices(self): + if self.sorted: + return + + linear = self.linear_loc(signed=True) + + if (np.diff(linear) > 0).all(): # already sorted + self.sorted = True + return self + + order = np.argsort(linear) + self.coords = self.coords[:, order] + self.data = self.data[order] + self.sorted = True + return self def sum_duplicates(self): # Inspired by scipy/sparse/coo.py::sum_duplicates @@ -400,15 +465,21 @@ def sum_duplicates(self): return self if not np.prod(self.coords.shape): return self - order = np.lexsort(self.coords) - coords = self.coords[:, order] - data = self.data[order] - unique_mask = (coords[:, 1:] != coords[:, :-1]).any(axis=0) + + self.sort_indices() + + linear = self.linear_loc() + unique_mask = np.diff(linear) != 0 + + if unique_mask.sum() == len(unique_mask): # already unique + self.has_duplicates = False + return self + unique_mask = np.append(True, unique_mask) - coords = coords[:, unique_mask] + coords = self.coords[:, unique_mask] (unique_inds,) = np.nonzero(unique_mask) - data = np.add.reduceat(data, unique_inds, dtype=data.dtype) + data = np.add.reduceat(self.data, unique_inds, dtype=self.data.dtype) self.data = data self.coords = coords @@ -430,7 +501,8 @@ def __radd__(self, other): return self + other def __neg__(self): - return COO(self.coords, -self.data, self.shape, self.has_duplicates) + return COO(self.coords, -self.data, self.shape, self.has_duplicates, + self.sorted) def __sub__(self, other): return self + (-other) @@ -462,7 +534,9 @@ def elemwise(self, func, *args, **kwargs): raise ValueError("Performing this operation would produce " "a dense result: %s" % str(func)) return COO(self.coords, func(self.data, *args, **kwargs), - shape=self.shape, has_duplicates=self.has_duplicates) + shape=self.shape, + has_duplicates=self.has_duplicates, + sorted=self.sorted) def elemwise_binary(self, func, other, *args, **kwargs): assert isinstance(other, COO) @@ -675,6 +749,7 @@ def tensordot(a, b, axes=2): res = res.todense() else: res = COO.from_scipy_sparse(res) # <--- modified + res.has_duplicates = False if isinstance(res, np.matrix): res = np.asarray(res) return res.reshape(olda + oldb) @@ -743,7 +818,8 @@ def concatenate(arrays, axis=0): shape[axis] = dim has_duplicates = any(x.has_duplicates for x in arrays) - return COO(coords, data, shape=shape, has_duplicates=has_duplicates) + return COO(coords, data, shape=shape, has_duplicates=has_duplicates, + sorted=(axis == 0) and all(a.sorted for a in arrays)) def stack(arrays, axis=0): @@ -769,4 +845,5 @@ def stack(arrays, axis=0): coords.insert(axis, new) coords = np.stack(coords, axis=0) - return COO(coords, data, shape=shape, has_duplicates=has_duplicates) + return COO(coords, data, shape=shape, has_duplicates=has_duplicates, + sorted=(axis == 0) and all(a.sorted for a in arrays)) diff --git a/sparse/tests/test_core.py b/sparse/tests/test_core.py index e531245f..7b231934 100644 --- a/sparse/tests/test_core.py +++ b/sparse/tests/test_core.py @@ -71,7 +71,7 @@ def test_large_reshape(): col = row % m # np.random.randint(0, m, size=n, dtype=np.uint16) data = np.ones(n, dtype=np.uint8) - x = COO((data, (row, col))) + x = COO((data, (row, col)), sorted=True, has_duplicates=False) assert_eq(x, x.reshape(x.shape)) @@ -366,8 +366,8 @@ def test_scalar_exponentiation(): def test_create_with_lists_of_tuples(): L = [((0, 0, 0), 1), - ((1, 1, 1), 2), ((1, 2, 1), 1), + ((1, 1, 1), 2), ((1, 3, 2), 3)] s = COO(L) diff --git a/sparse/utils.py b/sparse/utils.py index 76af90c2..cf58a225 100644 --- a/sparse/utils.py +++ b/sparse/utils.py @@ -1,9 +1,18 @@ import numpy as np +from .core import COO def assert_eq(x, y): assert x.shape == y.shape assert x.dtype == y.dtype + + if isinstance(x, COO): + if x.sorted: + assert is_lexsorted(x) + if isinstance(y, COO): + if y.sorted: + assert is_lexsorted(y) + if hasattr(x, 'todense'): xx = x.todense() else: @@ -13,3 +22,7 @@ def assert_eq(x, y): else: yy = y assert np.allclose(xx, yy) + + +def is_lexsorted(x): + return not x.shape or (np.diff(x.linear_loc()) > 0).all() From cebb4a6ee2221271e6bc402ac0dc4fcd7d03d622 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 1 May 2017 18:08:55 -0400 Subject: [PATCH 2/3] Cache reshape and transpose This stores the last few objects returned from reshape and transpose calls. This allows efficiencies from in-place operations like `sum_duplicates` and `sort_indices` to persist in interative workflows. Modern NumPy programmers are accustomed to operations like .transpose() being cheap and aren't accustomed to having to pay sorting costs after many computations. These assumptions are no longer true in sparse by default. However, by caching recent transpose and reshape objects we can reuse their inplace modifications. This greatly accelerates common machine learning workloads. --- sparse/core.py | 33 +++++++++++++++++++++------------ sparse/tests/test_core.py | 21 +++++++++++++-------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/sparse/core.py b/sparse/core.py index 0750e402..adea4807 100644 --- a/sparse/core.py +++ b/sparse/core.py @@ -154,6 +154,7 @@ def __init__(self, coords, data=None, shape=None, has_duplicates=True, assert not self.shape or len(data) == self.coords.shape[1] self.has_duplicates = has_duplicates self.sorted = sorted + self._cache = defaultdict(lambda: deque(maxlen=3)) @classmethod def from_numpy(cls, x): @@ -209,6 +210,9 @@ def __getitem__(self, index): index = (index,) index = tuple(ind + self.shape[i] if isinstance(ind, numbers.Integral) and ind < 0 else ind for i, ind in enumerate(index)) + if (all(ind == slice(None) or ind == slice(0, d) + for ind, d in zip(index, self.shape))): + return self mask = np.ones(self.nnz, dtype=bool) for i, ind in enumerate([i for i in index if i is not None]): if ind == slice(None, None): @@ -325,19 +329,15 @@ def transpose(self, axes=None): if axes == tuple(range(self.ndim)): return self + for ax, value in self._cache['transpose']: + if ax == axes: + return value + shape = tuple(self.shape[ax] for ax in axes) result = COO(self.coords[axes, :], self.data, shape, has_duplicates=self.has_duplicates) - if axes == (1, 0): - try: - result._csc = self._csr.T - except AttributeError: - pass - try: - result._csr = self._csc.T - except AttributeError: - pass + self._cache['transpose'].append((axes, result)) return result @property @@ -374,8 +374,14 @@ def reshape(self, shape): extra = int(np.prod(self.shape) / np.prod([d for d in shape if d != -1])) shape = tuple([d if d != -1 else extra for d in shape]) + if self.shape == shape: return self + + for sh, value in self._cache['reshape']: + if sh == shape: + return value + # TODO: this np.prod(self.shape) enforces a 2**64 limit to array size linear_loc = self.linear_loc() @@ -385,9 +391,12 @@ def reshape(self, shape): coords[-(i + 1), :] = (linear_loc // strides) % d strides *= d - return COO(coords, self.data, shape, - has_duplicates=self.has_duplicates, - sorted=self.sorted) + result = COO(coords, self.data, shape, + has_duplicates=self.has_duplicates, + sorted=self.sorted) + + self._cache['reshape'].append((shape, result)) + return result def to_scipy_sparse(self): assert self.ndim == 2 diff --git a/sparse/tests/test_core.py b/sparse/tests/test_core.py index 7b231934..5a58df8e 100644 --- a/sparse/tests/test_core.py +++ b/sparse/tests/test_core.py @@ -418,14 +418,6 @@ def test_cache_csr(): assert s.tocsr() is s.tocsr() assert s.tocsc() is s.tocsc() - st = s.T - - assert_eq(st._csr, st) - assert_eq(st._csc, st) - - assert isinstance(st.tocsr(), scipy.sparse.csr_matrix) - assert isinstance(st.tocsc(), scipy.sparse.csc_matrix) - def test_empty_shape(): x = COO([], [1.0]) @@ -469,3 +461,16 @@ def test_add_many_sparse_arrays(): x = COO({(1, 1): 1}) y = sum([x] * 100) assert y.nnz < np.prod(y.shape) + + +def test_caching(): + x = COO({(10, 10, 10): 1}) + + assert x[:].reshape((100, 10)).transpose().tocsr() is x[:].reshape((100, 10)).transpose().tocsr() + + x = COO({(1, 1, 1, 1, 1, 1, 1, 2): 1}) + + for i in range(x.ndim): + x.reshape((1,) * i + (2,) + (1,) * (x.ndim - i - 1)) + + assert len(x._cache['reshape']) < 5 From 2a887635b4bd60be7c89ea9c72e14d851fd31ece Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 3 May 2017 13:10:57 -0400 Subject: [PATCH 3/3] Opt in to caching --- sparse/core.py | 109 +++++++++++++++++++++++++------------- sparse/tests/test_core.py | 6 ++- 2 files changed, 77 insertions(+), 38 deletions(-) diff --git a/sparse/core.py b/sparse/core.py index adea4807..e363db34 100644 --- a/sparse/core.py +++ b/sparse/core.py @@ -96,8 +96,10 @@ class COO(object): __array_priority__ = 12 def __init__(self, coords, data=None, shape=None, has_duplicates=True, - sorted=False): - self._cache = defaultdict(lambda: deque(maxlen=3)) + sorted=False, cache=False): + self._cache = None + if cache: + self.enable_caching() if data is None: # {(i, j, k): x, (i, j, k): y, ...} if isinstance(coords, dict): @@ -154,7 +156,30 @@ def __init__(self, coords, data=None, shape=None, has_duplicates=True, assert not self.shape or len(data) == self.coords.shape[1] self.has_duplicates = has_duplicates self.sorted = sorted + + def enable_caching(self): + """ Enable caching of reshape, transpose, and tocsr/csc operations + + This enables efficient iterative workflows that make heavy use of + csr/csc operations, such as tensordot. This maintains a cache of + recent results of reshape and transpose so that operations like + tensordot (which uses both internally) store efficiently stored + representations for repeated use. This can significantly cut down on + computational costs in common numeric algorithms. + + However, this also assumes that neither this object, nor the downstream + objects will have their data mutated. + + Examples + -------- + >>> x.enable_caching() # doctest: +SKIP + >>> csr1 = x.transpose((2, 0, 1)).reshape((100, 120)).tocsr() # doctest: +SKIP + >>> csr2 = x.transpose((2, 0, 1)).reshape((100, 120)).tocsr() # doctest: +SKIP + >>> csr1 is csr2 # doctest: +SKIP + True + """ self._cache = defaultdict(lambda: deque(maxlen=3)) + return self @classmethod def from_numpy(cls, x): @@ -329,15 +354,18 @@ def transpose(self, axes=None): if axes == tuple(range(self.ndim)): return self - for ax, value in self._cache['transpose']: - if ax == axes: - return value + if self._cache is not None: + for ax, value in self._cache['transpose']: + if ax == axes: + return value shape = tuple(self.shape[ax] for ax in axes) result = COO(self.coords[axes, :], self.data, shape, - has_duplicates=self.has_duplicates) + has_duplicates=self.has_duplicates, + cache=self._cache is not None) - self._cache['transpose'].append((axes, result)) + if self._cache is not None: + self._cache['transpose'].append((axes, result)) return result @property @@ -378,9 +406,10 @@ def reshape(self, shape): if self.shape == shape: return self - for sh, value in self._cache['reshape']: - if sh == shape: - return value + if self._cache is not None: + for sh, value in self._cache['reshape']: + if sh == shape: + return value # TODO: this np.prod(self.shape) enforces a 2**64 limit to array size linear_loc = self.linear_loc() @@ -393,9 +422,10 @@ def reshape(self, shape): result = COO(coords, self.data, shape, has_duplicates=self.has_duplicates, - sorted=self.sorted) + sorted=self.sorted, cache=self._cache is not None) - self._cache['reshape'].append((shape, result)) + if self._cache is not None: + self._cache['reshape'].append((shape, result)) return result def to_scipy_sparse(self): @@ -424,32 +454,39 @@ def _tocsr(self): return scipy.sparse.csr_matrix((self.data, col, indptr), shape=self.shape) def tocsr(self): - try: - return self._csr - except AttributeError: - pass - try: - self._csr = self._csc.tocsr() - return self._csr - except AttributeError: - pass - - self._csr = self._tocsr() - return self._csr + if self._cache is not None: + try: + return self._csr + except AttributeError: + pass + try: + self._csr = self._csc.tocsr() + return self._csr + except AttributeError: + pass + + self._csr = csr = self._tocsr() + else: + csr = self._tocsr() + return csr def tocsc(self): - try: - return self._csc - except AttributeError: - pass - try: - self._csc = self._csr.tocsc() - return self._csc - except AttributeError: - pass - - self._csc = self.tocsr().tocsc() - return self._csc + if self._cache is not None: + try: + return self._csc + except AttributeError: + pass + try: + self._csc = self._csr.tocsc() + return self._csc + except AttributeError: + pass + + self._csc = csc = self.tocsr().tocsc() + else: + csc = self.tocsr().tocsc() + + return csc def sort_indices(self): if self.sorted: diff --git a/sparse/tests/test_core.py b/sparse/tests/test_core.py index 5a58df8e..d6c4f171 100644 --- a/sparse/tests/test_core.py +++ b/sparse/tests/test_core.py @@ -411,7 +411,7 @@ def test_scipy_sparse_interface(): def test_cache_csr(): x = random_x((10, 5)) - s = COO.from_numpy(x) + s = COO(x, cache=True) assert isinstance(s.tocsr(), scipy.sparse.csr_matrix) assert isinstance(s.tocsc(), scipy.sparse.csc_matrix) @@ -465,10 +465,12 @@ def test_add_many_sparse_arrays(): def test_caching(): x = COO({(10, 10, 10): 1}) + assert x[:].reshape((100, 10)).transpose().tocsr() is not x[:].reshape((100, 10)).transpose().tocsr() + x = COO({(10, 10, 10): 1}, cache=True) assert x[:].reshape((100, 10)).transpose().tocsr() is x[:].reshape((100, 10)).transpose().tocsr() - x = COO({(1, 1, 1, 1, 1, 1, 1, 2): 1}) + x = COO({(1, 1, 1, 1, 1, 1, 1, 2): 1}, cache=True) for i in range(x.ndim): x.reshape((1,) * i + (2,) + (1,) * (x.ndim - i - 1))