From ddc5977d56e1708ae4824ff5e84fe692b1679de3 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian
Date: Sun, 27 Aug 2023 20:10:27 -0700
Subject: [PATCH 1/5] Corrected boolean indexing cumsum

- The cumulative sum was being calculated incorrectly -- the offset from
  stride simplification was unused and the result was incorrect for some
  cases with non-C-contiguous strides
- To fix this, a new function ``compact_iteration_space`` and a
  complementary helper ``compact_iteration`` have been implemented
---
 .../kernels/boolean_advanced_indexing.hpp     |  4 +-
 .../libtensor/include/utils/strided_iters.hpp | 52 +++++++++++++++++++
 .../source/boolean_advanced_indexing.cpp      | 17 +++---
 .../source/simplify_iteration_space.cpp       | 39 ++++++++++++++
 .../source/simplify_iteration_space.hpp       |  7 +++
 5 files changed, 106 insertions(+), 13 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
index bb2ddc5ad6..43c546860b 100644
--- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
@@ -424,7 +424,6 @@ typedef size_t (*mask_positions_strided_impl_fn_ptr_t)(
     size_t,
     const char *,
     int,
-    py::ssize_t,
     const py::ssize_t *,
     char *,
     std::vector<sycl::event> const &);
@@ -434,7 +433,6 @@ size_t mask_positions_strided_impl(sycl::queue q,
                                    size_t n_elems,
                                    const char *mask,
                                    int nd,
-                                   py::ssize_t input_offset,
                                    const py::ssize_t *shape_strides,
                                    char *cumsum,
                                    std::vector<sycl::event> const &depends = {})
@@ -444,7 +442,7 @@ size_t mask_positions_strided_impl(sycl::queue q,
     cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
     size_t wg_size = 128;
 
-    StridedIndexer strided_indexer{nd, input_offset, shape_strides};
+    StridedIndexer strided_indexer{nd, 0, shape_strides};
     NonZeroIndicator non_zero_indicator{};
 
     sycl::event comp_ev =
diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
index f654087281..6f041d2fa1 100644
--- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
+++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
@@ -909,6 +909,58 @@ contract_iter4(vecT shape,
                    out_strides3, disp3, out_strides4, disp4);
 }
 
+/*
+   For purposes of iterating over pairs of elements of two arrays
+   with `shape` and strides `strides1`, `strides2` given as pointers
+   `simplify_iteration_two_strides(nd, shape_ptr, strides1_ptr,
+   strides2_ptr, disp1, disp2)`
+   may modify memory and returns new length of these arrays.
+
+   The new shape and new strides, as well as the offset
+   `(new_shape, new_strides1, disp1, new_stride2, disp2)` are such that
+   iterating over them will traverse the same set of pairs of elements,
+   possibly in a different order.
+ */
+template <typename ShapeTy, typename StridesTy>
+int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides)
+{
+    if (nd < 2)
+        return nd;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        if (strides[i] < 0) {
+            contractable = false;
+        }
+    }
+
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str = strides[i + 1];
+            StridesTy jump = strides[i] - (shape[i + 1] - 1) * str;
+
+            if (jump == str) {
+                changed = true;
+                shape[i] *= shape[i + 1];
+                for (int j = i; j < nd_; ++j) {
+                    strides[j] = strides[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape[j] = shape[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+
+    return nd_;
+}
+
 } // namespace strides
 } // namespace tensor
 } // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
index 0dd63fe973..ae72f46794 100644
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -205,22 +205,19 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
     auto const &strides_vector = mask.get_strides_vector();
 
     using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_strides;
-    py::ssize_t offset(0);
+    shT compact_shape;
+    shT compact_strides;
 
     int mask_nd = mask.get_ndim();
     int nd = mask_nd;
 
-    dpctl::tensor::py_internal::simplify_iteration_space_1(
-        nd, shape, strides_vector, simplified_shape, simplified_strides,
-        offset);
+    dpctl::tensor::py_internal::compact_iteration_space(
+        nd, shape, strides_vector, compact_shape, compact_strides);
 
-    if (nd == 1 && simplified_strides[0] == 1) {
+    if (nd == 1 && compact_strides[0] == 1) {
         auto fn = (use_i32)
                       ? mask_positions_contig_i32_dispatch_vector[mask_typeid]
                       : mask_positions_contig_i64_dispatch_vector[mask_typeid];
-
         return fn(exec_q, mask_size, mask_data, cumsum_data, depends);
     }
 
@@ -232,7 +229,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
 
     using dpctl::tensor::offset_utils::device_allocate_and_pack;
     const auto &ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, simplified_shape, simplified_strides);
+        exec_q, host_task_events, compact_shape, compact_strides);
     py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple);
     if (shape_strides == nullptr) {
         sycl::event::wait(host_task_events);
@@ -253,7 +250,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
     dependent_events.insert(dependent_events.end(), depends.begin(),
                             depends.end());
 
-    size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset,
+    size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd,
                                   shape_strides, cumsum_data, dependent_events);
 
     sycl::event::wait(host_task_events);
diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
index 2fb2d6078e..31f5250f8a 100644
--- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
+++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
@@ -369,6 +369,45 @@ void simplify_iteration_space_4(
     }
 }
 
+void compact_iteration_space(int &nd,
+                             const py::ssize_t *const &shape,
+                             std::vector<py::ssize_t> const &strides,
+                             // output
+                             std::vector<py::ssize_t> &compact_shape,
+                             std::vector<py::ssize_t> &compact_strides)
+{
+    using dpctl::tensor::strides::compact_iteration;
+    if (nd > 1) {
+        // Compact iteration space to reduce dimensionality
+        // and improve access pattern
+        compact_shape.reserve(nd);
+        compact_shape.insert(std::begin(compact_shape), shape, shape + nd);
+        assert(compact_shape.size() == static_cast<size_t>(nd));
+
+        compact_strides.reserve(nd);
+        compact_strides.insert(std::end(compact_strides), std::begin(strides),
+                               std::end(strides));
+        assert(compact_strides.size() == static_cast<size_t>(nd));
+
+        int contracted_nd =
+            compact_iteration(nd, compact_shape.data(), compact_strides.data());
+        compact_shape.resize(contracted_nd);
+        compact_strides.resize(contracted_nd);
+
+        nd = contracted_nd;
+    }
+    else if (nd == 1) {
+        // Populate vectors
+        compact_shape.reserve(nd);
+        compact_shape.push_back(shape[0]);
+        assert(compact_shape.size() == static_cast<size_t>(nd));
+
+        compact_strides.reserve(nd);
+        compact_strides.push_back(strides[0]);
+        assert(compact_strides.size() == static_cast<size_t>(nd));
+    }
+}
+
 py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &mi,
                                  std::vector<py::ssize_t> const &shape)
 {
diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
index 1bd8ff5aa0..275129ad5c 100644
--- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
+++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
@@ -90,6 +90,13 @@ void simplify_iteration_space_4(int &,
                                 py::ssize_t &,
                                 py::ssize_t &);
 
+void compact_iteration_space(int &,
+                             const py::ssize_t *const &,
+                             std::vector<py::ssize_t> const &,
+                             // output
+                             std::vector<py::ssize_t> &,
+                             std::vector<py::ssize_t> &);
+
 py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &,
                                  std::vector<py::ssize_t> const &);
 py::ssize_t _ravel_multi_index_f(std::vector<py::ssize_t> const &,

From 741dc7fe894315c01a77fb8b07183349a52171a1 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian
Date: Sun, 27 Aug 2023 21:16:18 -0700
Subject: [PATCH 2/5] Added tests for the corrected behavior of boolean indexing and nonzero

---
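As an illustration of the case the new strided test exercises: reversing a
boolean mask gives it a negative stride, so extraction goes through the
strided mask_positions path fixed in the previous patch. A minimal session
along these lines shows the user-visible behavior being pinned down (the
selection pattern below is arbitrary, chosen only for the example):

    import dpctl.tensor as dpt
    import numpy as np

    x = dpt.arange(15, dtype="i4")
    sel_np = np.zeros(15, dtype="?")
    sel_np[[2, 5, 11]] = True      # arbitrary True positions
    sel = dpt.asarray(sel_np)

    # the reversed mask is not C-contiguous, so the strided kernel is used
    res = x[sel[::-1]]
    expected = dpt.asnumpy(x)[sel_np[::-1]]
    assert (dpt.asnumpy(res) == expected).all()
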
 dpctl/tests/test_usm_ndarray_indexing.py | 26 ++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py
index 9d166226e7..c2a693b972 100644
--- a/dpctl/tests/test_usm_ndarray_indexing.py
+++ b/dpctl/tests/test_usm_ndarray_indexing.py
@@ -1044,6 +1044,19 @@ def test_extract_all_1d():
     res2 = dpt.extract(sel, x)
     assert (dpt.asnumpy(res2) == expected_res).all()
 
+    # test strided case
+    x = dpt.arange(15, dtype="i4")
+    sel_np = np.zeros(15, dtype="?")
+    np.put(sel_np, np.random.choice(sel_np.size, size=7), True)
+    sel = dpt.asarray(sel_np)
+
+    res = x[sel[::-1]]
+    expected_res = dpt.asnumpy(x)[sel_np[::-1]]
+    assert (dpt.asnumpy(res) == expected_res).all()
+
+    res2 = dpt.extract(sel[::-1], x)
+    assert (dpt.asnumpy(res2) == expected_res).all()
+
 
 def test_extract_all_2d():
     get_queue_or_skip()
@@ -1287,6 +1300,19 @@ def test_nonzero():
     assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all()
 
 
+def test_nonzero_f_contig():
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5), dtype="?", order="F")
+    mask[2, 3] = True
+
+    expected_res = (2, 3)
+    res = dpt.nonzero(mask)
+
+    assert expected_res == res
+    assert mask[res]
+
+
 def test_assign_scalar():
     get_queue_or_skip()
     x = dpt.arange(-5, 5, dtype="i8")

From a1b1477af3c23d2021271223c8074eb9c003d617 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Sun, 27 Aug 2023 23:00:14 -0500
Subject: [PATCH 3/5] Removed dead branch in py_mask_positions

Compacting strides can reduce dimensionality of the array, but it cannot
turn an input that is not already C-contiguous into a C-contiguous one.
Hence the branch checking whether the input became C-contiguous after
compacting is effectively dead.
---
 .../tensor/libtensor/source/boolean_advanced_indexing.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
index ae72f46794..d37967daef 100644
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -214,13 +214,6 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
     dpctl::tensor::py_internal::compact_iteration_space(
         nd, shape, strides_vector, compact_shape, compact_strides);
 
-    if (nd == 1 && compact_strides[0] == 1) {
-        auto fn = (use_i32)
-                      ? mask_positions_contig_i32_dispatch_vector[mask_typeid]
-                      : mask_positions_contig_i64_dispatch_vector[mask_typeid];
-        return fn(exec_q, mask_size, mask_data, cumsum_data, depends);
-    }
-
     // Strided implementation
     auto strided_fn =
         (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid]

From e69271e73ca5cf9a19af8425d8d16e4d5da2975b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Sun, 27 Aug 2023 23:02:42 -0500
Subject: [PATCH 4/5] Add a test for nonzero where dimension compacting occurs

---
 dpctl/tests/test_usm_ndarray_indexing.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py
index c2a693b972..9183226be2 100644
--- a/dpctl/tests/test_usm_ndarray_indexing.py
+++ b/dpctl/tests/test_usm_ndarray_indexing.py
@@ -1301,6 +1301,7 @@ def test_nonzero():
 
 
 def test_nonzero_f_contig():
+    "See gh-1370"
     get_queue_or_skip()
 
     mask = dpt.zeros((5, 5), dtype="?", order="F")
@@ -1313,6 +1314,24 @@ def test_nonzero_f_contig():
     assert mask[res]
 
 
+def test_nonzero_compacting():
+    """See gh-1370.
+    Test with input where dimensionality
+    of iteration space is compacted from 3d to 2d
+    """
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5, 5), dtype="?", order="F")
+    mask[3, 2, 1] = True
+    mask_view = mask[..., :3]
+
+    expected_res = (3, 2, 1)
+    res = dpt.nonzero(mask_view)
+
+    assert expected_res == res
+    assert mask_view[res]
+
+
 def test_assign_scalar():
     get_queue_or_skip()
     x = dpt.arange(-5, 5, dtype="i8")

From ab163acfd7142a0d9a46968677e9705b46b3f9b7 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian
Date: Mon, 28 Aug 2023 07:36:15 -0700
Subject: [PATCH 5/5] Added docstring for compact_iteration

---
 .../libtensor/include/utils/strided_iters.hpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
index 6f041d2fa1..bd174e3f90 100644
--- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
+++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
@@ -910,16 +910,13 @@ contract_iter4(vecT shape,
 }
 
 /*
-   For purposes of iterating over pairs of elements of two arrays
-   with `shape` and strides `strides1`, `strides2` given as pointers
-   `simplify_iteration_two_strides(nd, shape_ptr, strides1_ptr,
-   strides2_ptr, disp1, disp2)`
-   may modify memory and returns new length of these arrays.
+   For purposes of iterating over elements of an array with `shape` and
+   strides `strides` given as pointers `compact_iteration(nd, shape, strides)`
+   may modify memory and returns the new length of the array.
 
-   The new shape and new strides, as well as the offset
-   `(new_shape, new_strides1, disp1, new_stride2, disp2)` are such that
-   iterating over them will traverse the same set of pairs of elements,
-   possibly in a different order.
+   The new shape and new strides `(new_shape, new_strides)` are such that
+   iterating over them will traverse the same elements in the same order,
+   possibly with reduced dimensionality.
  */
 template <typename ShapeTy, typename StridesTy>
 int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides)
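For reference, the compacting rule documented in the patch above can be
modeled in a few lines of Python. This is only an illustrative sketch of the
rule, not the C++ implementation itself, and the helper name and values below
are made up for the example: adjacent axes (i, i + 1) merge whenever
strides[i] == shape[i + 1] * strides[i + 1], and compaction is skipped when
any stride is negative.

    def compact(shape, strides):
        # Python model of compact_iteration: merge adjacent axes whose
        # strides nest contiguously, restarting the scan after every merge.
        shape, strides = list(shape), list(strides)
        if any(st < 0 for st in strides):
            return shape, strides
        i = 0
        while i + 1 < len(shape):
            if strides[i] == shape[i + 1] * strides[i + 1]:
                shape[i : i + 2] = [shape[i] * shape[i + 1]]
                strides[i : i + 2] = [strides[i + 1]]
                i = 0
            else:
                i += 1
        return shape, strides

    # A C-contiguous 2x3x4 block collapses to a single axis of 24 elements:
    assert compact([2, 3, 4], [12, 4, 1]) == ([24], [1])
    # With a padded innermost axis (pitch 8 for 4 elements) only the two
    # outer axes merge:
    assert compact([2, 3, 4], [24, 8, 1]) == ([6, 4], [8, 1])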