
Fixes boolean indexing for strided masks #1370

Merged
5 commits merged on Aug 28, 2023
@@ -424,7 +424,6 @@ typedef size_t (*mask_positions_strided_impl_fn_ptr_t)(
size_t,
const char *,
int,
py::ssize_t,
const py::ssize_t *,
char *,
std::vector<sycl::event> const &);
@@ -434,7 +433,6 @@ size_t mask_positions_strided_impl(sycl::queue q,
size_t n_elems,
const char *mask,
int nd,
py::ssize_t input_offset,
const py::ssize_t *shape_strides,
char *cumsum,
std::vector<sycl::event> const &depends = {})
@@ -444,7 +442,7 @@ size_t mask_positions_strided_impl(sycl::queue q,
cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
size_t wg_size = 128;

StridedIndexer strided_indexer{nd, input_offset, shape_strides};
StridedIndexer strided_indexer{nd, 0, shape_strides};
NonZeroIndicator<maskT, cumsumT> non_zero_indicator{};

sycl::event comp_ev =
49 changes: 49 additions & 0 deletions dpctl/tensor/libtensor/include/utils/strided_iters.hpp
@@ -909,6 +909,55 @@ contract_iter4(vecT shape,
out_strides3, disp3, out_strides4, disp4);
}

/*
For the purpose of iterating over the elements of an array with the given
`shape` and `strides` (passed as pointers), `compact_iteration(nd, shape,
strides)` may modify `shape` and `strides` in place and returns the new
number of dimensions.

The compacted `shape` and `strides` are such that iterating over them
traverses the same elements in the same order, possibly with reduced
dimensionality.
*/
template <class ShapeTy, class StridesTy>
int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides)
{
if (nd < 2)
return nd;

bool contractable = true;
for (int i = 0; i < nd; ++i) {
if (strides[i] < 0) {
contractable = false;
}
}

int nd_ = nd;
while (contractable) {
bool changed = false;
for (int i = 0; i + 1 < nd_; ++i) {
StridesTy str = strides[i + 1];
StridesTy jump = strides[i] - (shape[i + 1] - 1) * str;

if (jump == str) {
changed = true;
shape[i] *= shape[i + 1];
for (int j = i; j + 1 < nd_; ++j) {
strides[j] = strides[j + 1];
}
for (int j = i + 1; j + 1 < nd_; ++j) {
shape[j] = shape[j + 1];
}
--nd_;
break;
}
}
if (!changed)
break;
}

return nd_;
}

} // namespace strides
} // namespace tensor
} // namespace dpctl
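
The contraction rule above fuses a pair of adjacent dimensions only when stepping from the last element of the inner dimension to the first element of the next outer block advances by exactly the inner stride, i.e. when strides[i] == shape[i + 1] * strides[i + 1]. Unlike the previous simplify_iteration_space_1 path (which produced simplified strides plus an offset), compaction leaves the order of the dimensions and the sign of the strides untouched, so a C-order walk of the compacted (shape, strides) visits elements in exactly the same order as a C-order walk of the original array. Below is a minimal standalone sketch of the same rule; the helper name compact_sketch and the (3, 2, 5) example array are made up for illustration and are not part of dpctl.

#include <cstdio>
#include <vector>

// Standalone copy of the contraction rule used by compact_iteration:
// merge adjacent dims i and i+1 when strides[i] == shape[i+1] * strides[i+1].
template <class ShapeTy, class StridesTy>
int compact_sketch(int nd, ShapeTy *shape, StridesTy *strides)
{
    if (nd < 2)
        return nd;
    for (int i = 0; i < nd; ++i)
        if (strides[i] < 0)
            return nd; // negative strides: leave the space uncontracted
    bool changed = true;
    while (changed) {
        changed = false;
        for (int i = 0; i + 1 < nd; ++i) {
            if (strides[i] == shape[i + 1] * strides[i + 1]) {
                shape[i] *= shape[i + 1];    // fuse the pair
                strides[i] = strides[i + 1]; // merged dim keeps the inner stride
                for (int j = i + 1; j + 1 < nd; ++j) {
                    shape[j] = shape[j + 1]; // close the gap
                    strides[j] = strides[j + 1];
                }
                --nd;
                changed = true;
                break;
            }
        }
    }
    return nd;
}

int main()
{
    // A C-contiguous (3, 4, 5) array sliced to (3, 2, 5): strides stay (20, 5, 1).
    std::vector<long> shape{3, 2, 5};
    std::vector<long> strides{20, 5, 1};

    int nd = compact_sketch(3, shape.data(), strides.data());

    // Dims 1 and 2 fuse (5 == 5 * 1): prints "nd=2 shape=(3, 10) strides=(20, 1)".
    std::printf("nd=%d shape=(%ld, %ld) strides=(%ld, %ld)\n", nd, shape[0],
                shape[1], strides[0], strides[1]);
    return 0;
}

Negative strides make the space non-contractable, which is why a reversed mask such as the one in the new extract test still goes through the generic strided path unchanged.
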
22 changes: 6 additions & 16 deletions dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -205,24 +205,14 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
auto const &strides_vector = mask.get_strides_vector();

using shT = std::vector<py::ssize_t>;
shT simplified_shape;
shT simplified_strides;
py::ssize_t offset(0);
shT compact_shape;
shT compact_strides;

int mask_nd = mask.get_ndim();
int nd = mask_nd;

dpctl::tensor::py_internal::simplify_iteration_space_1(
nd, shape, strides_vector, simplified_shape, simplified_strides,
offset);

if (nd == 1 && simplified_strides[0] == 1) {
auto fn = (use_i32)
? mask_positions_contig_i32_dispatch_vector[mask_typeid]
: mask_positions_contig_i64_dispatch_vector[mask_typeid];

return fn(exec_q, mask_size, mask_data, cumsum_data, depends);
}
dpctl::tensor::py_internal::compact_iteration_space(
nd, shape, strides_vector, compact_shape, compact_strides);

// Strided implementation
auto strided_fn =
@@ -232,7 +222,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,

using dpctl::tensor::offset_utils::device_allocate_and_pack;
const auto &ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
exec_q, host_task_events, simplified_shape, simplified_strides);
exec_q, host_task_events, compact_shape, compact_strides);
py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple);
if (shape_strides == nullptr) {
sycl::event::wait(host_task_events);
@@ -253,7 +243,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
dependent_events.insert(dependent_events.end(), depends.begin(),
depends.end());

size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset,
size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd,
shape_strides, cumsum_data, dependent_events);

sycl::event::wait(host_task_events);
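
With the simplification step removed, py_mask_positions passes the kernel the compacted shape together with the original strides (and no extra offset), and the strided kernel walks the mask in C order of its logical shape. That traversal order is what makes the cumulative sum, and hence the positions consumed by extract and nonzero, agree with the C-order flattening of the mask even when the underlying data is Fortran-ordered or strided. A small host-side illustration of that traversal follows; it is plain C++ rather than the SYCL kernel, and the 2x3 Fortran-order mask is made up for the example.

#include <cstdio>
#include <vector>

int main()
{
    // A 2x3 boolean mask stored in Fortran (column-major) order:
    // element (i, j) lives at offset i * 1 + j * 2.
    // Logically, mask[0][1] and mask[1][2] are the only true entries.
    std::vector<char> data = {0, 0, 1, 0, 0, 1};
    const long shape[2] = {2, 3};
    const long strides[2] = {1, 2};

    // C-order traversal using the strides, as the strided kernel does,
    // accumulating an inclusive cumulative sum of set elements.
    std::vector<long> cumsum;
    long running = 0;
    for (long i = 0; i < shape[0]; ++i) {
        for (long j = 0; j < shape[1]; ++j) {
            running += (data[i * strides[0] + j * strides[1]] != 0);
            cumsum.push_back(running);
        }
    }

    // Prints 0 1 1 1 1 2: identical to the cumulative sum of the
    // C-order flattening [0, 1, 0, 0, 0, 1].
    for (long v : cumsum)
        std::printf("%ld ", v);
    std::printf("\n");
    return 0;
}

The printed cumulative sum 0 1 1 1 1 2 is exactly what one gets by flattening the mask in C order first, so the set positions 1 and 5 unravel to the logical indices (0, 1) and (1, 2).
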
39 changes: 39 additions & 0 deletions dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
@@ -369,6 +369,45 @@ void simplify_iteration_space_4(
}
}

void compact_iteration_space(int &nd,
const py::ssize_t *const &shape,
std::vector<py::ssize_t> const &strides,
// output
std::vector<py::ssize_t> &compact_shape,
std::vector<py::ssize_t> &compact_strides)
{
using dpctl::tensor::strides::compact_iteration;
if (nd > 1) {
// Compact iteration space to reduce dimensionality
// and improve access pattern
compact_shape.reserve(nd);
compact_shape.insert(std::begin(compact_shape), shape, shape + nd);
assert(compact_shape.size() == static_cast<size_t>(nd));

compact_strides.reserve(nd);
compact_strides.insert(std::end(compact_strides), std::begin(strides),
std::end(strides));
assert(compact_strides.size() == static_cast<size_t>(nd));

int contracted_nd =
compact_iteration(nd, compact_shape.data(), compact_strides.data());
compact_shape.resize(contracted_nd);
compact_strides.resize(contracted_nd);

nd = contracted_nd;
}
else if (nd == 1) {
// Populate vectors
compact_shape.reserve(nd);
compact_shape.push_back(shape[0]);
assert(compact_shape.size() == static_cast<size_t>(nd));

compact_strides.reserve(nd);
compact_strides.push_back(strides[0]);
assert(compact_strides.size() == static_cast<size_t>(nd));
}
}

py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &mi,
std::vector<py::ssize_t> const &shape)
{
7 changes: 7 additions & 0 deletions dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
@@ -90,6 +90,13 @@ void simplify_iteration_space_4(int &,
py::ssize_t &,
py::ssize_t &);

void compact_iteration_space(int &,
const py::ssize_t *const &,
std::vector<py::ssize_t> const &,
// output
std::vector<py::ssize_t> &,
std::vector<py::ssize_t> &);

py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &,
std::vector<py::ssize_t> const &);
py::ssize_t _ravel_multi_index_f(std::vector<py::ssize_t> const &,
45 changes: 45 additions & 0 deletions dpctl/tests/test_usm_ndarray_indexing.py
@@ -1044,6 +1044,19 @@ def test_extract_all_1d():
res2 = dpt.extract(sel, x)
assert (dpt.asnumpy(res2) == expected_res).all()

# test strided case
x = dpt.arange(15, dtype="i4")
sel_np = np.zeros(15, dtype="?")
np.put(sel_np, np.random.choice(sel_np.size, size=7), True)
sel = dpt.asarray(sel_np)

res = x[sel[::-1]]
expected_res = dpt.asnumpy(x)[sel_np[::-1]]
assert (dpt.asnumpy(res) == expected_res).all()

res2 = dpt.extract(sel[::-1], x)
assert (dpt.asnumpy(res2) == expected_res).all()


def test_extract_all_2d():
get_queue_or_skip()
@@ -1287,6 +1300,38 @@ def test_nonzero():
assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all()


def test_nonzero_f_contig():
"See gh-1370"
get_queue_or_skip()

mask = dpt.zeros((5, 5), dtype="?", order="F")
mask[2, 3] = True

expected_res = (2, 3)
res = dpt.nonzero(mask)

assert expected_res == res
assert mask[res]


def test_nonzero_compacting():
"""See gh-1370.
Test with input where dimensionality
of iteration space is compacted from 3d to 2d
"""
get_queue_or_skip()

mask = dpt.zeros((5, 5, 5), dtype="?", order="F")
mask[3, 2, 1] = True
mask_view = mask[..., :3]

expected_res = (3, 2, 1)
res = dpt.nonzero(mask_view)

assert expected_res == res
assert mask_view[res]


def test_assign_scalar():
get_queue_or_skip()
x = dpt.arange(-5, 5, dtype="i8")