From ddc5977d56e1708ae4824ff5e84fe692b1679de3 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian
Date: Sun, 27 Aug 2023 20:10:27 -0700
Subject: [PATCH 1/5] Corrected boolean indexing cumsum

- The cumulative sum was being calculated incorrectly -- the offset from
  stride simplification was unused and the result was incorrect for some
  cases with non-C-contiguous strides
- To fix this, a new function ``compact_iteration_space`` and a
  complementary helper ``compact_iteration`` have been implemented
---
 .../kernels/boolean_advanced_indexing.hpp     |  4 +-
 .../libtensor/include/utils/strided_iters.hpp | 52 +++++++++++++++++++
 .../source/boolean_advanced_indexing.cpp      | 17 +++---
 .../source/simplify_iteration_space.cpp       | 39 ++++++++++++++
 .../source/simplify_iteration_space.hpp       |  7 +++
 5 files changed, 106 insertions(+), 13 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
index bb2ddc5ad6..43c546860b 100644
--- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
@@ -424,7 +424,6 @@ typedef size_t (*mask_positions_strided_impl_fn_ptr_t)(
     size_t,
     const char *,
     int,
-    py::ssize_t,
     const py::ssize_t *,
     char *,
     std::vector<sycl::event> const &);
@@ -434,7 +433,6 @@ size_t mask_positions_strided_impl(sycl::queue q,
                                    size_t n_elems,
                                    const char *mask,
                                    int nd,
-                                   py::ssize_t input_offset,
                                    const py::ssize_t *shape_strides,
                                    char *cumsum,
                                    std::vector<sycl::event> const &depends = {})
@@ -444,7 +442,7 @@ size_t mask_positions_strided_impl(sycl::queue q,
     cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
     size_t wg_size = 128;
 
-    StridedIndexer strided_indexer{nd, input_offset, shape_strides};
+    StridedIndexer strided_indexer{nd, 0, shape_strides};
     NonZeroIndicator non_zero_indicator{};
 
     sycl::event comp_ev =
diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
index f654087281..6f041d2fa1 100644
--- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
+++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
@@ -909,6 +909,58 @@ contract_iter4(vecT shape,
                    out_strides3, disp3, out_strides4, disp4);
 }
 
+/*
+   For purposes of iterating over pairs of elements of two arrays
+   with `shape` and strides `strides1`, `strides2` given as pointers
+   `simplify_iteration_two_strides(nd, shape_ptr, strides1_ptr,
+   strides2_ptr, disp1, disp2)`
+   may modify memory and returns new length of these arrays.
+
+   The new shape and new strides, as well as the offset
+   `(new_shape, new_strides1, disp1, new_stride2, disp2)` are such that
+   iterating over them will traverse the same set of pairs of elements,
+   possibly in a different order.
+ */
+template <typename ShapeTy, typename StridesTy>
+int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides)
+{
+    if (nd < 2)
+        return nd;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        if (strides[i] < 0) {
+            contractable = false;
+        }
+    }
+
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str = strides[i + 1];
+            StridesTy jump = strides[i] - (shape[i + 1] - 1) * str;
+
+            if (jump == str) {
+                changed = true;
+                shape[i] *= shape[i + 1];
+                for (int j = i; j < nd_; ++j) {
+                    strides[j] = strides[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape[j] = shape[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+
+    return nd_;
+}
+
 } // namespace strides
 } // namespace tensor
 } // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
index 0dd63fe973..ae72f46794 100644
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -205,22 +205,19 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
     auto const &strides_vector = mask.get_strides_vector();
 
     using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_strides;
-    py::ssize_t offset(0);
+    shT compact_shape;
+    shT compact_strides;
 
     int mask_nd = mask.get_ndim();
     int nd = mask_nd;
 
-    dpctl::tensor::py_internal::simplify_iteration_space_1(
-        nd, shape, strides_vector, simplified_shape, simplified_strides,
-        offset);
+    dpctl::tensor::py_internal::compact_iteration_space(
+        nd, shape, strides_vector, compact_shape, compact_strides);
 
-    if (nd == 1 && simplified_strides[0] == 1) {
+    if (nd == 1 && compact_strides[0] == 1) {
         auto fn = (use_i32)
                       ? mask_positions_contig_i32_dispatch_vector[mask_typeid]
                       : mask_positions_contig_i64_dispatch_vector[mask_typeid];
-
         return fn(exec_q, mask_size, mask_data, cumsum_data, depends);
     }
 
@@ -232,7 +229,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
 
     using dpctl::tensor::offset_utils::device_allocate_and_pack;
     const auto &ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, simplified_shape, simplified_strides);
+        exec_q, host_task_events, compact_shape, compact_strides);
     py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple);
     if (shape_strides == nullptr) {
         sycl::event::wait(host_task_events);
@@ -253,7 +250,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
     dependent_events.insert(dependent_events.end(), depends.begin(),
                             depends.end());
 
-    size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset,
+    size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd,
                                   shape_strides, cumsum_data, dependent_events);
 
     sycl::event::wait(host_task_events);
diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
index 2fb2d6078e..31f5250f8a 100644
--- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
+++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
@@ -369,6 +369,45 @@ void simplify_iteration_space_4(
     }
 }
 
+void compact_iteration_space(int &nd,
+                             const py::ssize_t *const &shape,
+                             std::vector<py::ssize_t> const &strides,
+                             // output
+                             std::vector<py::ssize_t> &compact_shape,
+                             std::vector<py::ssize_t> &compact_strides)
+{
+    using dpctl::tensor::strides::compact_iteration;
+    if (nd > 1) {
+        // Compact iteration space to reduce dimensionality
+        // and improve access pattern
+        compact_shape.reserve(nd);
+        compact_shape.insert(std::begin(compact_shape), shape, shape + nd);
+        assert(compact_shape.size() == static_cast<size_t>(nd));
+
+        compact_strides.reserve(nd);
+        compact_strides.insert(std::end(compact_strides), std::begin(strides),
+                               std::end(strides));
+        assert(compact_strides.size() == static_cast<size_t>(nd));
+
+        int contracted_nd =
+            compact_iteration(nd, compact_shape.data(), compact_strides.data());
+        compact_shape.resize(contracted_nd);
+        compact_strides.resize(contracted_nd);
+
+        nd = contracted_nd;
+    }
+    else if (nd == 1) {
+        // Populate vectors
+        compact_shape.reserve(nd);
+        compact_shape.push_back(shape[0]);
+        assert(compact_shape.size() == static_cast<size_t>(nd));
+
+        compact_strides.reserve(nd);
+        compact_strides.push_back(strides[0]);
+        assert(compact_strides.size() == static_cast<size_t>(nd));
+    }
+}
+
 py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &mi,
                                  std::vector<py::ssize_t> const &shape)
 {
diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
index 1bd8ff5aa0..275129ad5c 100644
--- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
+++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
@@ -90,6 +90,13 @@ void simplify_iteration_space_4(int &,
                                 py::ssize_t &,
                                 py::ssize_t &);
 
+void compact_iteration_space(int &,
+                             const py::ssize_t *const &,
+                             std::vector<py::ssize_t> const &,
+                             // output
+                             std::vector<py::ssize_t> &,
+                             std::vector<py::ssize_t> &);
+
 py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &,
                                  std::vector<py::ssize_t> const &);
 py::ssize_t _ravel_multi_index_f(std::vector<py::ssize_t> const &,

From 741dc7fe894315c01a77fb8b07183349a52171a1 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian
Date: Sun, 27 Aug 2023 21:16:18 -0700
Subject: [PATCH 2/5] Added tests for the corrected behavior of boolean indexing and nonzero

---
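As an illustration of the case the new strided test exercises: reversing a
boolean mask gives it a negative stride, so extraction goes through the
strided mask_positions path fixed in the previous patch. A minimal session
along these lines shows the user-visible behavior being pinned down (the
selection pattern below is arbitrary, chosen only for the example):

    import dpctl.tensor as dpt
    import numpy as np

    x = dpt.arange(15, dtype="i4")
    sel_np = np.zeros(15, dtype="?")
    sel_np[[2, 5, 11]] = True      # arbitrary True positions
    sel = dpt.asarray(sel_np)

    # the reversed mask is not C-contiguous, so the strided kernel is used
    res = x[sel[::-1]]
    expected = dpt.asnumpy(x)[sel_np[::-1]]
    assert (dpt.asnumpy(res) == expected).all()
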
 dpctl/tests/test_usm_ndarray_indexing.py | 26 ++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py
index 9d166226e7..c2a693b972 100644
--- a/dpctl/tests/test_usm_ndarray_indexing.py
+++ b/dpctl/tests/test_usm_ndarray_indexing.py
@@ -1044,6 +1044,19 @@ def test_extract_all_1d():
     res2 = dpt.extract(sel, x)
     assert (dpt.asnumpy(res2) == expected_res).all()
 
+    # test strided case
+    x = dpt.arange(15, dtype="i4")
+    sel_np = np.zeros(15, dtype="?")
+    np.put(sel_np, np.random.choice(sel_np.size, size=7), True)
+    sel = dpt.asarray(sel_np)
+
+    res = x[sel[::-1]]
+    expected_res = dpt.asnumpy(x)[sel_np[::-1]]
+    assert (dpt.asnumpy(res) == expected_res).all()
+
+    res2 = dpt.extract(sel[::-1], x)
+    assert (dpt.asnumpy(res2) == expected_res).all()
+
 
 def test_extract_all_2d():
     get_queue_or_skip()
@@ -1287,6 +1300,19 @@ def test_nonzero():
     assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all()
 
 
+def test_nonzero_f_contig():
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5), dtype="?", order="F")
+    mask[2, 3] = True
+
+    expected_res = (2, 3)
+    res = dpt.nonzero(mask)
+
+    assert expected_res == res
+    assert mask[res]
+
+
 def test_assign_scalar():
     get_queue_or_skip()
     x = dpt.arange(-5, 5, dtype="i8")

From a1b1477af3c23d2021271223c8074eb9c003d617 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Sun, 27 Aug 2023 23:00:14 -0500
Subject: [PATCH 3/5] Removed dead branch in py_mask_positions

Compacting strides can reduce dimensionality of the array, but it cannot
turn an input that is not already C-contiguous into a C-contiguous one.
Hence the branch checking whether the input became C-contiguous after
compacting is effectively dead.
---
 .../tensor/libtensor/source/boolean_advanced_indexing.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
index ae72f46794..d37967daef 100644
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -214,13 +214,6 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask,
     dpctl::tensor::py_internal::compact_iteration_space(
         nd, shape, strides_vector, compact_shape, compact_strides);
 
-    if (nd == 1 && compact_strides[0] == 1) {
-        auto fn = (use_i32)
-                      ? mask_positions_contig_i32_dispatch_vector[mask_typeid]
-                      : mask_positions_contig_i64_dispatch_vector[mask_typeid];
-        return fn(exec_q, mask_size, mask_data, cumsum_data, depends);
-    }
-
     // Strided implementation
     auto strided_fn =
         (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid]

From e69271e73ca5cf9a19af8425d8d16e4d5da2975b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Sun, 27 Aug 2023 23:02:42 -0500
Subject: [PATCH 4/5] Add a test for nonzero where dimension compacting occurs

---
 dpctl/tests/test_usm_ndarray_indexing.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py
index c2a693b972..9183226be2 100644
--- a/dpctl/tests/test_usm_ndarray_indexing.py
+++ b/dpctl/tests/test_usm_ndarray_indexing.py
@@ -1301,6 +1301,7 @@ def test_nonzero():
 
 
 def test_nonzero_f_contig():
+    "See gh-1370"
     get_queue_or_skip()
 
     mask = dpt.zeros((5, 5), dtype="?", order="F")
@@ -1313,6 +1314,24 @@ def test_nonzero_f_contig():
     assert mask[res]
 
 
+def test_nonzero_compacting():
+    """See gh-1370.
+    Test with input where dimensionality
+    of iteration space is compacted from 3d to 2d
+    """
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5, 5), dtype="?", order="F")
+    mask[3, 2, 1] = True
+    mask_view = mask[..., :3]
+
+    expected_res = (3, 2, 1)
+    res = dpt.nonzero(mask_view)
+
+    assert expected_res == res
+    assert mask_view[res]
+
+
 def test_assign_scalar():
     get_queue_or_skip()
     x = dpt.arange(-5, 5, dtype="i8")

From ab163acfd7142a0d9a46968677e9705b46b3f9b7 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian
Date: Mon, 28 Aug 2023 07:36:15 -0700
Subject: [PATCH 5/5] Added docstring for compact_iteration

---
 .../libtensor/include/utils/strided_iters.hpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
index 6f041d2fa1..bd174e3f90 100644
--- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
+++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
@@ -910,16 +910,13 @@ contract_iter4(vecT shape,
 }
 
 /*
-   For purposes of iterating over pairs of elements of two arrays
-   with `shape` and strides `strides1`, `strides2` given as pointers
-   `simplify_iteration_two_strides(nd, shape_ptr, strides1_ptr,
-   strides2_ptr, disp1, disp2)`
-   may modify memory and returns new length of these arrays.
+   For purposes of iterating over elements of an array with `shape` and
+   strides `strides` given as pointers `compact_iteration(nd, shape, strides)`
+   may modify memory and returns the new length of the array.
 
-   The new shape and new strides, as well as the offset
-   `(new_shape, new_strides1, disp1, new_stride2, disp2)` are such that
-   iterating over them will traverse the same set of pairs of elements,
-   possibly in a different order.
+   The new shape and new strides `(new_shape, new_strides)` are such that
+   iterating over them will traverse the same elements in the same order,
+   possibly with reduced dimensionality.
  */
 template <typename ShapeTy, typename StridesTy>
 int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides)
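For reference, the compacting rule documented in the patch above can be
modeled in a few lines of Python. This is only an illustrative sketch of the
rule, not the C++ implementation itself, and the helper name and values below
are made up for the example: adjacent axes (i, i + 1) merge whenever
strides[i] == shape[i + 1] * strides[i + 1], and compaction is skipped when
any stride is negative.

    def compact(shape, strides):
        # Python model of compact_iteration: merge adjacent axes whose
        # strides nest contiguously, restarting the scan after every merge.
        shape, strides = list(shape), list(strides)
        if any(st < 0 for st in strides):
            return shape, strides
        i = 0
        while i + 1 < len(shape):
            if strides[i] == shape[i + 1] * strides[i + 1]:
                shape[i : i + 2] = [shape[i] * shape[i + 1]]
                strides[i : i + 2] = [strides[i + 1]]
                i = 0
            else:
                i += 1
        return shape, strides

    # A C-contiguous 2x3x4 block collapses to a single axis of 24 elements:
    assert compact([2, 3, 4], [12, 4, 1]) == ([24], [1])
    # With a padded innermost axis (pitch 8 for 4 elements) only the two
    # outer axes merge:
    assert compact([2, 3, 4], [24, 8, 1]) == ([6, 4], [8, 1])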