diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index f717f4b309..d0ad1c8205 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -18,7 +18,7 @@ jobs: strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v3 with: @@ -63,7 +63,7 @@ jobs: strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] env: conda-bld: C:\Miniconda\conda-bld\win-64\ steps: @@ -102,7 +102,7 @@ jobs: strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] experimental: [false] runner: [ubuntu-20.04] continue-on-error: ${{ matrix.experimental }} @@ -185,7 +185,7 @@ jobs: shell: cmd /C CALL {0} strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] experimental: [false] runner: [windows-latest] continue-on-error: ${{ matrix.experimental }} @@ -300,7 +300,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] steps: - name: Download artifact uses: actions/download-artifact@v3 @@ -324,7 +324,7 @@ jobs: runs-on: windows-latest strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] steps: - name: Download artifact uses: actions/download-artifact@v3 @@ -608,7 +608,11 @@ jobs: echo "Array API standard conformance tests failed to run for dpctl=$PACKAGE_VERSION." exit 1 fi + - name: Output API summary + shell: bash -l {0} + run: echo "::notice ${{ env.MESSAGE }}" - name: Post result to PR + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork }} uses: mshick/add-pr-comment@v1 with: message: | diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml index 768d958e02..84bbed4622 100644 --- a/.github/workflows/generate-docs.yml +++ b/.github/workflows/generate-docs.yml @@ -49,7 +49,7 @@ jobs: if: ${{ !github.event.pull_request || github.event.action != 'closed' }} shell: bash -l {0} run: | - pip install numpy cython setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics + pip install numpy cython setuptools scikit-build cmake sphinx"<7.2" sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics - name: Checkout repo uses: actions/checkout@v3 with: @@ -76,7 +76,7 @@ jobs: mv ../cmake-install/docs/docs ~/docs git clean -dfx - name: Publish docs - if: ${{ github.ref == 'refs/heads/master' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.ref == 'refs/heads/master' }} shell: bash -l {0} run: | git remote add tokened_docs https://IntelPython:${{ secrets.GITHUB_TOKEN }}@github.com/IntelPython/dpctl.git @@ -91,8 +91,15 @@ jobs: git config --global user.email 'github-actions[doc-deploy-bot]@users.noreply.github.com' git commit -m "Latest docs." 
git push tokened_docs gh-pages + - name: Save built docs as an artifact + if: ${{ github.event.pull_request && github.event.pull_request.head.repo.fork && github.event.action != 'closed'}} + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGE_NAME }} rendered documentation + path: ~/docs + - name: Publish pull-request docs - if: ${{ github.event.pull_request && github.event.action != 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action != 'closed' }} env: PR_NUM: ${{ github.event.number }} shell: bash -l {0} @@ -111,7 +118,7 @@ jobs: git commit -m "Docs for pull request ${PR_NUM}" git push tokened_docs gh-pages - name: Unpublish pull-request docs - if: ${{ github.event.pull_request && github.event.action == 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action == 'closed' }} env: PR_NUM: ${{ github.event.number }} shell: bash -l {0} @@ -128,7 +135,7 @@ jobs: git commit -m "Removing docs for closed pull request ${PR_NUM}" git push tokened_docs gh-pages - name: Comment with URL to published pull-request docs - if: ${{ github.event.pull_request && github.event.action != 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action != 'closed' }} env: PR_NUM: ${{ github.event.number }} uses: mshick/add-pr-comment@v1 @@ -138,7 +145,7 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} repo-token-user-login: 'github-actions[bot]' - name: Comment with URL about removal of PR docs - if: ${{ github.event.pull_request && github.event.action == 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action == 'closed' }} env: PR_NUM: ${{ github.event.number }} uses: mshick/add-pr-comment@v1 diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml index 55d57653f4..d12747f3a9 100644 --- a/.github/workflows/os-llvm-sycl-build.yml +++ b/.github/workflows/os-llvm-sycl-build.yml @@ -97,18 +97,18 @@ jobs: - name: Install system components shell: bash -l {0} run: | - sudo apt-get install ninja-build libtinfo5 + sudo apt-get install libtinfo5 - name: Setup Python uses: actions/setup-python@v4 with: - python-version: '3.9' + python-version: '3.11' architecture: x64 - name: Install dpctl dependencies shell: bash -l {0} run: | - pip install numpy cython setuptools pytest scikit-build cmake + pip install numpy cython setuptools pytest scikit-build cmake ninja - name: Checkout repo uses: actions/checkout@v3 diff --git a/cmake/FindDpctl.cmake b/cmake/FindDpctl.cmake index e917aaa194..fe75f3767f 100644 --- a/cmake/FindDpctl.cmake +++ b/cmake/FindDpctl.cmake @@ -17,15 +17,8 @@ # if(NOT Dpctl_FOUND) - set(_find_extra_args) - if(Dpctl_FIND_REQUIRED) - list(APPEND _find_extra_args REQUIRED) - endif() - if(Dpctl_FIND_QUIET) - list(APPEND _find_extra_args QUIET) - endif() - find_package(PythonInterp ${_find_extra_args}) - find_package(PythonLibs ${_find_extra_args}) + find_package(Python 3.9 REQUIRED + COMPONENTS Interpreter Development.Module) if(PYTHON_EXECUTABLE) execute_process(COMMAND "${PYTHON_EXECUTABLE}" diff --git a/conda-recipe/run_test.bat b/conda-recipe/run_test.bat index 833ddfd21c..85cac031e7 100644 --- a/conda-recipe/run_test.bat +++ b/conda-recipe/run_test.bat @@ -3,8 +3,8 @@ "%PYTHON%" -c "import dpctl; print(dpctl.__version__)" if errorlevel 1 exit 1 -"%PYTHON%" -c "import dpctl; dpctl.lsplatform()" +"%PYTHON%" -m dpctl -f if 
errorlevel 1 exit 1 -python -m pytest -q -p no:faulthandler -ra --disable-warnings --pyargs dpctl -vv +python -m pytest -q -ra --disable-warnings --pyargs dpctl -vv if errorlevel 1 exit 1 diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh index 4b3566d7bd..63ae7996cc 100644 --- a/conda-recipe/run_test.sh +++ b/conda-recipe/run_test.sh @@ -3,5 +3,5 @@ set -e ${PYTHON} -c "import dpctl; print(dpctl.__version__)" -${PYTHON} -c "import dpctl; dpctl.lsplatform(verbosity=2)" -${PYTHON} -m pytest -q -ra --disable-warnings -p no:faulthandler --cov dpctl --cov-report term-missing --pyargs dpctl -vv +${PYTHON} -m dpctl -f +${PYTHON} -m pytest -q -ra --disable-warnings --cov dpctl --cov-report term-missing --pyargs dpctl -vv diff --git a/dpctl/_sycl_context.pyx b/dpctl/_sycl_context.pyx index 9a9f7c5277..71c2fb7591 100644 --- a/dpctl/_sycl_context.pyx +++ b/dpctl/_sycl_context.pyx @@ -114,7 +114,7 @@ cdef class SyclContext(_SyclContext): ctx = dpctl.SyclContext() print(ctx.get_devices()) - - Invoking the constuctor with a specific filter string that creates a + - Invoking the constructor with a specific filter string that creates a context for the device corresponding to the filter string. :Example: @@ -127,7 +127,7 @@ cdef class SyclContext(_SyclContext): d = ctx.get_devices()[0] assert(d.is_gpu) - - Invoking the constuctor with a :class:`dpctl.SyclDevice` object + - Invoking the constructor with a :class:`dpctl.SyclDevice` object creates a context for that device. :Example: @@ -141,7 +141,7 @@ cdef class SyclContext(_SyclContext): d = ctx.get_devices()[0] assert(d.is_gpu) - - Invoking the constuctor with a list of :class:`dpctl.SyclDevice` + - Invoking the constructor with a list of :class:`dpctl.SyclDevice` objects creates a common context for all the devices. This constructor call is especially useful when creation a context for multiple sub-devices. @@ -159,7 +159,7 @@ cdef class SyclContext(_SyclContext): ctx = dpctl.SyclContext(sub_devices) assert(len(ctx.get_devices) == len(sub_devices)) - - Invoking the constuctor with a named ``PyCapsule`` with name + - Invoking the constructor with a named ``PyCapsule`` with name **"SyclContextRef"** that carries a pointer to a ``sycl::context`` object. The capsule will be renamed upon successful consumption to ensure one-time use. A new named capsule can be constructed by @@ -430,7 +430,7 @@ cdef class SyclContext(_SyclContext): return num_devs else: raise ValueError( - "An error was encountered quering the number of devices " + "An error was encountered querying the number of devices " "associated with this context" ) diff --git a/dpctl/_sycl_device.pyx b/dpctl/_sycl_device.pyx index 6814dec677..50ce94d94c 100644 --- a/dpctl/_sycl_device.pyx +++ b/dpctl/_sycl_device.pyx @@ -1163,7 +1163,7 @@ cdef class SyclDevice(_SyclDevice): def create_sub_devices(self, **kwargs): """create_sub_devices(partition=parition_spec) Creates a list of sub-devices by partitioning a root device based on the - provided partion specifier. + provided partition specifier. A partition specifier must be provided using a "partition" keyword argument. 
Possible values for the specifier are: an int, a diff --git a/dpctl/_sycl_event.pyx b/dpctl/_sycl_event.pyx index 90165b4547..34576a2ef7 100644 --- a/dpctl/_sycl_event.pyx +++ b/dpctl/_sycl_event.pyx @@ -119,7 +119,7 @@ cdef class SyclEvent(_SyclEvent): # Create a default SyclEvent e = dpctl.SyclEvent() - - Invoking the constuctor with a named ``PyCapsule`` with name + - Invoking the constructor with a named ``PyCapsule`` with name **"SyclEventRef"** that carries a pointer to a ``sycl::event`` object. The capsule will be renamed upon successful consumption to ensure one-time use. A new named capsule can be constructed by diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index dbf0ae8385..6acf3396e1 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -246,7 +246,7 @@ cdef class SyclQueue(_SyclQueue): # create a queue for each sub-device using the common context queues = [dpctl.SyclQueue(ctx, sub_d) for sub_d in sub_devices] - - Invoking the constuctor with a named ``PyCapsule`` with the name + - Invoking the constructor with a named ``PyCapsule`` with the name **"SyclQueueRef"** that carries a pointer to a ``sycl::queue`` object. The capsule will be renamed upon successful consumption to ensure one-time use. A new named capsule can be constructed by diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 0137549251..322272df2d 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -37,14 +37,14 @@ class SyclTimer: q = dpctl.SyclQueue(property='enable_profiling') # create the timer - miliseconds_sc = 1e-3 - timer = dpctl.SyclTimer(time_scale = miliseconds_sc) + milliseconds_sc = 1e-3 + timer = dpctl.SyclTimer(time_scale = milliseconds_sc) # use the timer with timer(queue=q): code_block - # retrieve elapsed times in miliseconds + # retrieve elapsed times in milliseconds sycl_dt, wall_dt = timer.dt Remark: diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 1434da1f32..b529c41599 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -742,12 +742,12 @@ class usm_memory : public py::object return nullptr; } - auto convertor = + auto converter = ::dpctl::detail::dpctl_capi::get().as_usm_memory_pyobj(); py::object res; try { - res = convertor(py::handle(o)); + res = converter(py::handle(o)); } catch (const py::error_already_set &e) { return nullptr; } diff --git a/dpctl/memory/_memory.pyx b/dpctl/memory/_memory.pyx index 4ea19bfba0..e0900f870e 100644 --- a/dpctl/memory/_memory.pyx +++ b/dpctl/memory/_memory.pyx @@ -92,7 +92,7 @@ cdef void copy_via_host(void *dest_ptr, SyclQueue dest_queue, void *src_ptr, SyclQueue src_queue, size_t nbytes): """ Copies `nbytes` bytes from `src_ptr` USM memory to - `dest_ptr` USM memory using host as the intemediary. + `dest_ptr` USM memory using host as the intermediary. This is useful when `src_ptr` and `dest_ptr` are bound to incompatible SYCL contexts. 
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index ca83b8350b..49f25aef6a 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -37,6 +37,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 7438fb8a67..ad51689f3f 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -60,6 +60,7 @@ from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack from dpctl.tensor._indexing_functions import extract, nonzero, place, put, take +from dpctl.tensor._linear_algebra_functions import matrix_transpose from dpctl.tensor._manipulation_functions import ( broadcast_arrays, broadcast_to, @@ -199,6 +200,7 @@ "tril", "triu", "where", + "matrix_transpose", "all", "any", "dtype", diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 3eae29f057..bc1b071460 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -56,7 +56,7 @@ def _copy_to_numpy(ary): def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None): "Copies numpy array `np_ary` into a new usm_ndarray" - # This may peform a copy to meet stated requirements + # This may perform a copy to meet stated requirements Xnp = np.require(np_ary, requirements=["A", "E"]) alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None) dt = Xnp.dtype @@ -517,6 +517,11 @@ def copy(usm_ary, order="K"): - "K": match the layout of `usm_ary` as closely as possible. """ + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() if not isinstance(usm_ary, dpt.usm_ndarray): return TypeError( f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" @@ -585,11 +590,11 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): return TypeError( f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" ) - if not isinstance(order, str) or order not in ["A", "C", "F", "K"]: + if len(order) == 0 or order[0] not in "KkAaCcFf": raise ValueError( - "Unrecognized value of the order keyword. " - "Recognized values are 'A', 'C', 'F', or 'K'" + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." ) + order = order[0].upper() ary_dtype = usm_ary.dtype target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): diff --git a/dpctl/tensor/_device.py b/dpctl/tensor/_device.py index 63e9cee80f..30afacb435 100644 --- a/dpctl/tensor/_device.py +++ b/dpctl/tensor/_device.py @@ -96,7 +96,7 @@ def sycl_context(self): @property def sycl_device(self): """ - :class:`dpctl.SyclDevice` targed by this :class:`.Device`. + :class:`dpctl.SyclDevice` targeted by this :class:`.Device`. 
""" return self.sycl_queue_.sycl_device diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index 002b0ef5ec..7d46a3d814 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -29,6 +29,7 @@ from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK from ._type_utils import ( _acceptance_fn_default, + _all_data_types, _find_buf_dtype, _find_buf_dtype2, _to_device_supported_dtype, @@ -44,6 +45,7 @@ def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs): self.__name__ = "UnaryElementwiseFunc" self.name_ = name self.result_type_resolver_fn_ = result_type_resolver_fn + self.types_ = None self.unary_fn_ = unary_dp_impl_fn self.__doc__ = docs @@ -53,6 +55,18 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + @property + def types(self): + types = self.types_ + if not types: + types = [] + for dt1 in _all_data_types(True, True): + dt2 = self.result_type_resolver_fn_(dt1) + if dt2: + types.append(f"{dt1.char}->{dt2.char}") + self.types_ = types + return types + def __call__(self, x, out=None, order="K"): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") @@ -363,6 +377,7 @@ def __init__( self.__name__ = "BinaryElementwiseFunc" self.name_ = name self.result_type_resolver_fn_ = result_type_resolver_fn + self.types_ = None self.binary_fn_ = binary_dp_impl_fn self.binary_inplace_fn_ = binary_inplace_fn self.__doc__ = docs @@ -377,6 +392,20 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + @property + def types(self): + types = self.types_ + if not types: + types = [] + _all_dtypes = _all_data_types(True, True) + for dt1 in _all_dtypes: + for dt2 in _all_dtypes: + dt3 = self.result_type_resolver_fn_(dt1, dt2) + if dt3: + types.append(f"{dt1.char}{dt2.char}->{dt3.char}") + self.types_ = types + return types + def __call__(self, o1, o2, out=None, order="K"): if order not in ["K", "C", "F", "A"]: order = "K" @@ -439,7 +468,7 @@ def __call__(self, o1, o2, out=None, order="K"): o1_dtype = _get_dtype(o1, sycl_dev) o2_dtype = _get_dtype(o2, sycl_dev) if not all(_validate_dtype(o) for o in (o1_dtype, o2_dtype)): - raise ValueError("Operands of unsupported types") + raise ValueError("Operands have unsupported data types") o1_dtype, o2_dtype = _resolve_weak_types(o1_dtype, o2_dtype, sycl_dev) @@ -469,7 +498,7 @@ def __call__(self, o1, o2, out=None, order="K"): if out.shape != res_shape: raise ValueError( "The shape of input and output arrays are inconsistent. " - f"Expected output shape is {o1_shape}, got {out.shape}" + f"Expected output shape is {res_shape}, got {out.shape}" ) if res_dt != out.dtype: diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index fe85a183ba..8e2abee837 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -1628,7 +1628,7 @@ _subtract_docstring_ = """ subtract(x1, x2, out=None, order='K') -Calculates the difference bewteen each element `x1_i` of the input +Calculates the difference between each element `x1_i` of the input array `x1` and the respective element `x2_i` of the input array `x2`. 
Args: diff --git a/dpctl/tensor/_linear_algebra_functions.py b/dpctl/tensor/_linear_algebra_functions.py new file mode 100644 index 0000000000..fd2c58b08a --- /dev/null +++ b/dpctl/tensor/_linear_algebra_functions.py @@ -0,0 +1,48 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dpctl.tensor as dpt + + +def matrix_transpose(x): + """matrix_transpose(x) + + Transposes the innermost two dimensions of `x`, where `x` is a + 2-dimensional matrix or a stack of 2-dimensional matrices. + + To convert from a 1-dimensional array to a 2-dimensional column + vector, use x[:, dpt.newaxis]. + + Args: + x (usm_ndarray): + Input array with shape (..., m, n). + + Returns: + usm_ndarray: + Array with shape (..., n, m). + """ + + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + if x.ndim < 2: + raise ValueError( + "dpctl.tensor.matrix_transpose requires array to have" + "at least 2 dimensions" + ) + + return x.mT diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 9406e386af..cb54556ed2 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -15,7 +15,8 @@ # limitations under the License. -from itertools import chain, product, repeat +import operator +from itertools import chain, repeat import numpy as np from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple @@ -315,7 +316,7 @@ def broadcast_to(X, shape): raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") # Use numpy.broadcast_to to check the validity of the input - # parametr 'shape'. Raise ValueError if 'X' is not compatible + # parameter 'shape'. Raise ValueError if 'X' is not compatible # with 'shape' according to NumPy's broadcasting rules. 
new_array = np.broadcast_to( np.broadcast_to(np.empty(tuple(), dtype="u1"), X.shape), shape @@ -426,10 +427,11 @@ def roll(X, shift, axis=None): if not isinstance(X, dpt.usm_ndarray): raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") if axis is None: + shift = operator.index(shift) res = dpt.empty( X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue ) - hev, _ = ti._copy_usm_ndarray_for_reshape( + hev, _ = ti._copy_usm_ndarray_for_roll_1d( src=X, dst=res, shift=shift, sycl_queue=X.sycl_queue ) hev.wait() @@ -438,31 +440,20 @@ def roll(X, shift, axis=None): broadcasted = np.broadcast(shift, axis) if broadcasted.ndim > 1: raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") - shifts = {ax: 0 for ax in range(X.ndim)} + shifts = [ + 0, + ] * X.ndim for sh, ax in broadcasted: shifts[ax] += sh - rolls = [((np.s_[:], np.s_[:]),)] * X.ndim - for ax, offset in shifts.items(): - offset %= X.shape[ax] or 1 - if offset: - # (original, result), (original, result) - rolls[ax] = ( - (np.s_[:-offset], np.s_[offset:]), - (np.s_[-offset:], np.s_[:offset]), - ) + exec_q = X.sycl_queue res = dpt.empty( - X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue + X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=exec_q ) - hev_list = [] - for indices in product(*rolls): - arr_index, res_index = zip(*indices) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=X[arr_index], dst=res[res_index], sycl_queue=X.sycl_queue - ) - hev_list.append(hev) - - dpctl.SyclEvent.wait_for(hev_list) + ht_e, _ = ti._copy_usm_ndarray_for_roll_nd( + src=X, dst=res, shifts=shifts, sycl_queue=exec_q + ) + ht_e.wait() return res @@ -550,7 +541,6 @@ def _concat_axis_None(arrays): hev, _ = ti._copy_usm_ndarray_for_reshape( src=src_, dst=res[fill_start:fill_end], - shift=0, sycl_queue=exec_q, ) fill_start = fill_end @@ -757,7 +747,7 @@ def iinfo(dtype): Returns: iinfo_object: - An object with the followign attributes + An object with the following attributes * bits: int number of bits occupied by the data type * max: int diff --git a/dpctl/tensor/_print.py b/dpctl/tensor/_print.py index 30bb353df7..97e485a7e7 100644 --- a/dpctl/tensor/_print.py +++ b/dpctl/tensor/_print.py @@ -148,7 +148,7 @@ def set_print_options( suppress (bool, optional): If `True,` numbers equal to zero in the current precision will print as zero. Default: `False`. - nanstr (str, optional): String used to repesent nan. + nanstr (str, optional): String used to represent nan. Raises `TypeError` if nanstr is not a string. Default: `"nan"`. infstr (str, optional): String used to represent infinity. 
diff --git a/dpctl/tensor/_reshape.py b/dpctl/tensor/_reshape.py index ac4a04cac4..b363c063de 100644 --- a/dpctl/tensor/_reshape.py +++ b/dpctl/tensor/_reshape.py @@ -165,7 +165,7 @@ def reshape(X, shape, order="C", copy=None): ) if order == "C": hev, _ = _copy_usm_ndarray_for_reshape( - src=X, dst=flat_res, shift=0, sycl_queue=X.sycl_queue + src=X, dst=flat_res, sycl_queue=X.sycl_queue ) hev.wait() else: diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 4f45d57391..4289284251 100644 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -104,7 +104,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): Give basic slicing index `ind` and array layout information produce a 5-tuple (resulting_shape, resulting_strides, resulting_offset, advanced_ind, resulting_advanced_ind_pos) - used to contruct a view into underlying array over which advanced + used to construct a view into underlying array over which advanced indexing, if any, is to be performed. Raises IndexError for invalid index `ind`. @@ -201,7 +201,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): raise TypeError if ellipses_count > 1: raise IndexError( - "an index can only have a sinlge ellipsis ('...')") + "an index can only have a single ellipsis ('...')") if axes_referenced > len(shape): raise IndexError( "too many indices for an array, array is " diff --git a/dpctl/tensor/_stride_utils.pxi b/dpctl/tensor/_stride_utils.pxi index bc7349a5e6..4f12b989dc 100644 --- a/dpctl/tensor/_stride_utils.pxi +++ b/dpctl/tensor/_stride_utils.pxi @@ -55,7 +55,7 @@ cdef int _from_input_shape_strides( nelems - Number of elements in array min_disp = min( dot(strides, index), index for shape) max_disp = max( dor(strides, index), index for shape) - contig = enumation for array contiguity + contig = enumeration for array contiguity Returns: 0 on success, error code otherwise. On success pointers point to allocated arrays, Otherwise they are set to NULL diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 83e5e84759..ba18600135 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -52,6 +52,15 @@ cdef class InternalUSMArrayError(Exception): pass +cdef object _as_zero_dim_ndarray(object usm_ary): + "Convert size-1 array to NumPy 0d array" + mem_view = dpmem.as_usm_memory(usm_ary) + host_buf = mem_view.copy_to_host() + view = host_buf.view(usm_ary.dtype) + view.shape = tuple() + return view + + cdef class usm_ndarray: """ usm_ndarray(shape, dtype=None, strides=None, buffer="device", \ offset=0, order="C", buffer_ctor_kwargs=dict(), \ @@ -416,7 +425,7 @@ cdef class usm_ndarray: cdef char *ary_ptr = NULL if (not isinstance(self.base_, dpmem._memory._Memory)): raise InternalUSMArrayError( - "Invalid instance of usm_ndarray ecountered. " + "Invalid instance of usm_ndarray encountered. " "Private field base_ has an unexpected type {}.".format( type(self.base_) ) @@ -557,7 +566,7 @@ cdef class usm_ndarray: elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS): return _f_contig_strides(self.nd_, self.shape_) else: - raise ValueError("Inconsitent usm_ndarray data") + raise ValueError("Inconsistent usm_ndarray data") @property def flags(self): @@ -644,7 +653,7 @@ cdef class usm_ndarray: @property def T(self): - """ Returns tranposed array for 2D array, raises `ValueError` + """ Returns transposed array for 2D array, raises `ValueError` otherwise. 
""" if self.nd_ == 2: @@ -662,7 +671,7 @@ cdef class usm_ndarray: """ if self.nd_ < 2: raise ValueError( - "array.mT requires array to have at least 2-dimensons." + "array.mT requires array to have at least 2 dimensions." ) return _m_transpose(self) @@ -840,9 +849,7 @@ cdef class usm_ndarray: def __bool__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__bool__() if self.size == 0: @@ -857,9 +864,7 @@ cdef class usm_ndarray: def __float__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__float__() raise ValueError( @@ -868,9 +873,7 @@ cdef class usm_ndarray: def __complex__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__complex__() raise ValueError( @@ -879,9 +882,7 @@ cdef class usm_ndarray: def __int__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__int__() raise ValueError( @@ -1216,14 +1217,14 @@ cdef usm_ndarray _real_view(usm_ndarray ary): offset_elems = ary.get_offset() * 2 r = usm_ndarray.__new__( usm_ndarray, - _make_int_tuple(ary.nd_, ary.shape_), + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), dtype=_make_typestr(r_typenum_), strides=tuple(2 * si for si in ary.strides), buffer=ary.base_, offset=offset_elems, order=('C' if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else 'F') ) - r.flags_ = ary.flags_ + r.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) r.array_namespace_ = ary.array_namespace_ return r @@ -1248,14 +1249,14 @@ cdef usm_ndarray _imag_view(usm_ndarray ary): offset_elems = 2 * ary.get_offset() + 1 r = usm_ndarray.__new__( usm_ndarray, - _make_int_tuple(ary.nd_, ary.shape_), + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), dtype=_make_typestr(r_typenum_), strides=tuple(2 * si for si in ary.strides), buffer=ary.base_, offset=offset_elems, order=('C' if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else 'F') ) - r.flags_ = ary.flags_ + r.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) r.array_namespace_ = ary.array_namespace_ return r @@ -1375,8 +1376,8 @@ cdef api object UsmNDArray_MakeSimpleFromMemory( QRef: DPCTLSyclQueueRef associated with the allocation offset: distance between element with zero multi-index and the start of allocation - oder: Memory layout of the array. Use 'C' for C-contiguous or - row-major layout; 'F' for F-contiguous or column-major layout + order: Memory layout of the array. 
Use 'C' for C-contiguous or + row-major layout; 'F' for F-contiguous or column-major layout Returns: Created usm_ndarray instance """ diff --git a/dpctl/tensor/include/dlpack/README.md b/dpctl/tensor/include/dlpack/README.md index 02c594c0fe..2c22e9aa8d 100644 --- a/dpctl/tensor/include/dlpack/README.md +++ b/dpctl/tensor/include/dlpack/README.md @@ -4,4 +4,4 @@ The header `dlpack.h` downloaded from `https://github.com/dmlc/dlpack.git` remot The file can also be viewed using github web interface at https://github.com/dmlc/dlpack/blob/e2bdd3bee8cb6501558042633fa59144cc8b7f5f/include/dlpack/dlpack.h -License file was retrived from https://github.com/dmlc/dlpack/blob/main/LICENSE +License file was retrieved from https://github.com/dmlc/dlpack/blob/main/LICENSE diff --git a/dpctl/tensor/include/dlpack/dlpack.h b/dpctl/tensor/include/dlpack/dlpack.h index 6d51801123..672448d1c6 100644 --- a/dpctl/tensor/include/dlpack/dlpack.h +++ b/dpctl/tensor/include/dlpack/dlpack.h @@ -168,7 +168,7 @@ typedef struct { * `byte_offset` field should be used to point to the beginning of the data. * * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, - * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * TVM, perhaps others) do not adhere to this 256 byte alignment requirement * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed * (after which this note will be updated); at the moment it is recommended * to not rely on the data pointer being correctly aligned. diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index bb2ddc5ad6..43c546860b 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -424,7 +424,6 @@ typedef size_t (*mask_positions_strided_impl_fn_ptr_t)( size_t, const char *, int, - py::ssize_t, const py::ssize_t *, char *, std::vector const &); @@ -434,7 +433,6 @@ size_t mask_positions_strided_impl(sycl::queue q, size_t n_elems, const char *mask, int nd, - py::ssize_t input_offset, const py::ssize_t *shape_strides, char *cumsum, std::vector const &depends = {}) @@ -444,7 +442,7 @@ size_t mask_positions_strided_impl(sycl::queue q, cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); size_t wg_size = 128; - StridedIndexer strided_indexer{nd, input_offset, shape_strides}; + StridedIndexer strided_indexer{nd, 0, shape_strides}; NonZeroIndicator non_zero_indicator{}; sycl::event comp_ev = diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index f1e63ccc60..e5aaa34903 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -56,9 +56,6 @@ class copy_cast_contig_kernel; template class copy_cast_from_host_kernel; -template -class copy_for_reshape_generic_kernel; - template class Caster { public: @@ -118,7 +115,7 @@ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` to `dstTy`. - Both arrays have array dimensionality specied via argument `nd`. The + Both arrays have array dimensionality specified via argument `nd`. 
The `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the first `nd` elements encode common shape, second `nd` elements contain strides of `src` array, and the trailing `nd` elements contain strides of `dst` array. @@ -244,25 +241,26 @@ class ContigCopyFunctor if (base + n_vecs * vec_sz * sgSize < nelems && sgSize == max_sgSize) { - using src_ptrT = - sycl::multi_ptr; - using dst_ptrT = - sycl::multi_ptr; sycl::vec src_vec; sycl::vec dst_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - src_vec = - sg.load(src_ptrT(&src_p[base + it * sgSize])); + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &src_p[base + it * sgSize]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &dst_p[base + it * sgSize]); + + src_vec = sg.load(src_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; k++) { dst_vec[k] = fn(src_vec[k]); } - sg.store(dst_ptrT(&dst_p[base + it * sgSize]), - dst_vec); + sg.store(dst_multi_ptr, dst_vec); } } else { @@ -629,27 +627,24 @@ struct CopyAndCastFromHostFactory // =============== Copying for reshape ================== // +template +class copy_for_reshape_generic_kernel; + template class GenericCopyForReshapeFunctor { private: - py::ssize_t offset = 0; - py::ssize_t size = 1; - // USM array of size 2*(src_nd + dst_nd) - // [ src_shape; src_strides; dst_shape; dst_strides ] - Ty *src_p = nullptr; + const Ty *src_p = nullptr; Ty *dst_p = nullptr; SrcIndexerT src_indexer_; DstIndexerT dst_indexer_; public: - GenericCopyForReshapeFunctor(py::ssize_t shift, - py::ssize_t nelems, - char *src_ptr, + GenericCopyForReshapeFunctor(const char *src_ptr, char *dst_ptr, SrcIndexerT src_indexer, DstIndexerT dst_indexer) - : offset(shift), size(nelems), src_p(reinterpret_cast(src_ptr)), + : src_p(reinterpret_cast(src_ptr)), dst_p(reinterpret_cast(dst_ptr)), src_indexer_(src_indexer), dst_indexer_(dst_indexer) { @@ -657,45 +652,36 @@ class GenericCopyForReshapeFunctor void operator()(sycl::id<1> wiid) const { - py::ssize_t this_src_offset = src_indexer_(wiid.get(0)); - const Ty *in = src_p + this_src_offset; - - py::ssize_t shifted_wiid = - (static_cast(wiid.get(0)) + offset) % size; - shifted_wiid = (shifted_wiid >= 0) ? shifted_wiid : shifted_wiid + size; + const py::ssize_t src_offset = src_indexer_(wiid.get(0)); + const py::ssize_t dst_offset = dst_indexer_(wiid.get(0)); - py::ssize_t this_dst_offset = dst_indexer_(shifted_wiid); - - Ty *out = dst_p + this_dst_offset; - *out = *in; + dst_p[dst_offset] = src_p[src_offset]; } }; // define function type typedef sycl::event (*copy_for_reshape_fn_ptr_t)( sycl::queue, - py::ssize_t, // shift - size_t, // num_elements - int, - int, // src_nd, dst_nd + size_t, // num_elements + int, // src_nd + int, // dst_nd py::ssize_t *, // packed shapes and strides - char *, // src_data_ptr + const char *, // src_data_ptr char *, // dst_data_ptr const std::vector &); /*! * @brief Function to copy content of array while reshaping. * - * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * Submits a kernel to perform a copy `dst[unravel_index(i, * dst.shape)] = src[unravel_undex(i, src.shape)]`. * * @param q The execution queue where kernel is submitted. - * @param shift The shift in flat indexing. 
* @param nelems The number of elements to copy * @param src_nd Array dimension of the source array * @param dst_nd Array dimension of the destination array * @param packed_shapes_and_strides Kernel accessible USM array of size - * `2*src_nd + 2*dst_nd` with contant `[src_shape, src_strides, dst_shape, + * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape, * dst_strides]`. * @param src_p Typeless USM pointer to the buffer of the source array * @param dst_p Typeless USM pointer to the buffer of the destination array @@ -708,31 +694,40 @@ typedef sycl::event (*copy_for_reshape_fn_ptr_t)( template sycl::event copy_for_reshape_generic_impl(sycl::queue q, - py::ssize_t shift, size_t nelems, int src_nd, int dst_nd, py::ssize_t *packed_shapes_and_strides, - char *src_p, + const char *src_p, char *dst_p, const std::vector &depends) { dpctl::tensor::type_utils::validate_type_for_device(q); sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { - StridedIndexer src_indexer{ - src_nd, 0, - const_cast(packed_shapes_and_strides)}; - StridedIndexer dst_indexer{ - dst_nd, 0, - const_cast(packed_shapes_and_strides + - (2 * src_nd))}; cgh.depends_on(depends); - cgh.parallel_for>( + + // packed_shapes_and_strides: + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + + const py::ssize_t *src_shape_and_strides = + const_cast(packed_shapes_and_strides); + + const py::ssize_t *dst_shape_and_strides = + const_cast(packed_shapes_and_strides + + (2 * src_nd)); + + StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides}; + StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides}; + + using KernelName = + copy_for_reshape_generic_kernel; + + cgh.parallel_for( sycl::range<1>(nelems), GenericCopyForReshapeFunctor( - shift, nelems, src_p, dst_p, src_indexer, dst_indexer)); + src_p, dst_p, src_indexer, dst_indexer)); }); return copy_for_reshape_ev; @@ -752,6 +747,387 @@ template struct CopyForReshapeGenericFactory } }; +// ================== Copying for roll ================== // + +/*! @brief Functor to cyclically roll global_id to the left */ +struct LeftRolled1DTransformer +{ + LeftRolled1DTransformer(size_t offset, size_t size) + : offset_(offset), size_(size) + { + } + + size_t operator()(size_t gid) const + { + const size_t shifted_gid = + ((gid < offset_) ? gid + size_ - offset_ : gid - offset_); + return shifted_gid; + } + +private: + size_t offset_ = 0; + size_t size_ = 1; +}; + +/*! @brief Indexer functor to compose indexer and transformer */ +template struct CompositionIndexer +{ + CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {} + + auto operator()(size_t gid) const + { + return f_(t_(gid)); + } + +private: + IndexerT f_; + TransformerT t_; +}; + +/*! 
@brief Indexer functor to find offset for nd-shifted indices lifted from + * iteration id */ +struct RolledNDIndexer +{ + RolledNDIndexer(int nd, + const py::ssize_t *shape, + const py::ssize_t *strides, + const py::ssize_t *ndshifts, + py::ssize_t starting_offset) + : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts), + starting_offset_(starting_offset) + { + } + + py::ssize_t operator()(size_t gid) const + { + return compute_offset(gid); + } + +private: + int nd_ = -1; + const py::ssize_t *shape_ = nullptr; + const py::ssize_t *strides_ = nullptr; + const py::ssize_t *ndshifts_ = nullptr; + py::ssize_t starting_offset_ = 0; + + py::ssize_t compute_offset(py::ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd_); + py::ssize_t relative_offset_(0); + _ind.get_left_rolled_displacement( + gid, + shape_, // shape ptr + strides_, // strides ptr + ndshifts_, // shifts ptr + relative_offset_); + return starting_offset_ + relative_offset_; + } +}; + +template +class copy_for_roll_strided_kernel; + +template +class StridedCopyForRollFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + StridedCopyForRollFunctor(const Ty *src_ptr, + Ty *dst_ptr, + SrcIndexerT src_indexer, + DstIndexerT dst_indexer) + : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const size_t gid = wiid.get(0); + + const py::ssize_t src_offset = src_indexer_(gid); + const py::ssize_t dst_offset = dst_indexer_(gid); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( + sycl::queue, + size_t, // shift + size_t, // num_elements + int, // common_nd + const py::ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + py::ssize_t, // src_offset + char *, // dst_data_ptr + py::ssize_t, // dst_offset + const std::vector &); + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param nd Array dimensionality of the destination and source arrays + * @param packed_shapes_and_strides Kernel accessible USM array + * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of first element of src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of first element of dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @ingroup CopyAndCastKernels + */ +template +sycl::event +copy_for_roll_strided_impl(sycl::queue q, + size_t shift, + size_t nelems, + int nd, + const py::ssize_t *packed_shapes_and_strides, + const char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 3 * nd + // [ common_shape; src_strides; dst_strides ] + + StridedIndexer src_indexer{nd, src_offset, packed_shapes_and_strides}; + LeftRolled1DTransformer left_roll_transformer{shift, nelems}; + + using CompositeIndexerT = + CompositionIndexer; + + CompositeIndexerT rolled_src_indexer(src_indexer, + left_roll_transformer); + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, + packed_shapes_and_strides, + packed_shapes_and_strides + 2 * nd}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, rolled_src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +// define function type +typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( + sycl::queue, + size_t, // shift + size_t, // num_elements + const char *, // src_data_ptr + py::ssize_t, // src_offset + char *, // dst_data_ptr + py::ssize_t, // dst_offset + const std::vector &); + +template class copy_for_roll_contig_kernel; + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of the start of array src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of the start of array dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_contig_impl(sycl::queue q, + size_t shift, + size_t nelems, + const char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + NoOpIndexer src_indexer{}; + LeftRolled1DTransformer roller{shift, nelems}; + + CompositionIndexer + left_rolled_src_indexer{src_indexer, roller}; + NoOpIndexer dst_indexer{}; + + using KernelName = copy_for_roll_contig_kernel; + + const Ty *src_tp = reinterpret_cast(src_p) + src_offset; + Ty *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor< + Ty, CompositionIndexer, + NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer, + dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! 
+ * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue, + size_t, // num_elements + int, // common_nd + const py::ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + py::ssize_t, // src_offset + char *, // dst_data_ptr + py::ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue q, + size_t nelems, + int nd, + const py::ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const py::ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const py::ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const py::ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const py::ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, shifts_ptr, + src_offset}; + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. 
+ * @ingroup CopyAndCastKernels + */ +template struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + } // namespace copy_and_cast } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 855d5479c1..797a7f2534 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -65,9 +65,6 @@ struct UnaryContigFunctor if constexpr (UnaryOperatorT::is_constant::value) { // value of operator is known to be a known constant constexpr resT const_val = UnaryOperatorT::constant_value; - using out_ptrT = - sycl::multi_ptr; auto sg = ndit.get_sub_group(); std::uint8_t sgSize = sg.get_local_range()[0]; @@ -80,8 +77,11 @@ struct UnaryContigFunctor sycl::vec res_vec(const_val); #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + sg.store(out_multi_ptr, res_vec); } } else { @@ -94,13 +94,6 @@ struct UnaryContigFunctor else if constexpr (UnaryOperatorT::supports_sg_loadstore::value && UnaryOperatorT::supports_vec::value) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; - auto sg = ndit.get_sub_group(); std::uint16_t sgSize = sg.get_local_range()[0]; std::uint16_t max_sgSize = sg.get_max_local_range()[0]; @@ -113,10 +106,16 @@ struct UnaryContigFunctor #pragma unroll for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - x = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + x = sg.load(in_multi_ptr); sycl::vec res_vec = op(x); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -141,23 +140,23 @@ struct UnaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (maxsgSize == sgSize)) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg_vec = sg.load(in_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { arg_vec[k] = op(arg_vec[k]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - arg_vec); + sg.store(out_multi_ptr, arg_vec); } } else { @@ -179,24 +178,24 @@ struct UnaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (maxsgSize == sgSize)) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - 
arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg_vec = sg.load(in_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { res_vec[k] = op(arg_vec[k]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -365,28 +364,26 @@ struct BinaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg1_vec = - sg.load(in_ptrT1(&in1[base + it * sgSize])); - arg2_vec = - sg.load(in_ptrT2(&in2[base + it * sgSize])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[base + it * sgSize]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg1_vec = sg.load(in1_multi_ptr); + arg2_vec = sg.load(in2_multi_ptr); res_vec = op(arg1_vec, arg2_vec); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -407,32 +404,30 @@ struct BinaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg1_vec = - sg.load(in_ptrT1(&in1[base + it * sgSize])); - arg2_vec = - sg.load(in_ptrT2(&in2[base + it * sgSize])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[base + it * sgSize]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg1_vec = sg.load(in1_multi_ptr); + arg2_vec = sg.load(in2_multi_ptr); #pragma unroll for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { res_vec[vec_id] = op(arg1_vec[vec_id], arg2_vec[vec_id]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -530,22 +525,24 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; - - const argT1 mat_el = sg.load(in_ptrT1(&mat[base])); - const argT2 vec_el = sg.load(in_ptrT2(&padded_vec[base % n1])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + 
sycl::access::decorated::yes>(&mat[base]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT1 mat_el = sg.load(in1_multi_ptr); + const argT2 vec_el = sg.load(in2_multi_ptr); resT res_el = op(mat_el, vec_el); - sg.store(res_ptrT(&res[base]), res_el); + sg.store(out_multi_ptr, res_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; @@ -592,22 +589,24 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; - - const argT2 mat_el = sg.load(in_ptrT2(&mat[base])); - const argT1 vec_el = sg.load(in_ptrT1(&padded_vec[base % n1])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT2 mat_el = sg.load(in2_multi_ptr); + const argT1 vec_el = sg.load(in1_multi_ptr); resT res_el = op(vec_el, mat_el); - sg.store(res_ptrT(&res[base]), res_el); + sg.store(out_multi_ptr, res_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index a41029b27c..505a40acc5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -76,24 +76,24 @@ struct BinaryInplaceContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using rhs_ptrT = - sycl::multi_ptr; - using lhs_ptrT = - sycl::multi_ptr; + sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = - sg.load(rhs_ptrT(&rhs[base + it * sgSize])); - res_vec = - sg.load(lhs_ptrT(&lhs[base + it * sgSize])); + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[base + it * sgSize]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[base + it * sgSize]); + + arg_vec = sg.load(rhs_multi_ptr); + res_vec = sg.load(lhs_multi_ptr); op(res_vec, arg_vec); - sg.store(lhs_ptrT(&lhs[base + it * sgSize]), - res_vec); + + sg.store(lhs_multi_ptr, res_vec); } } else { @@ -115,27 +115,25 @@ struct BinaryInplaceContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using rhs_ptrT = - sycl::multi_ptr; - using lhs_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = - sg.load(rhs_ptrT(&rhs[base + it * sgSize])); - res_vec = - sg.load(lhs_ptT(&lhs[base + it * sgSize])); + auto rhs_multi_ptr = sycl::address_space_cast< + 
sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[base + it * sgSize]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[base + it * sgSize]); + + arg_vec = sg.load(rhs_multi_ptr); + res_vec = sg.load(lhs_multi_ptr); #pragma unroll for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { op(res_vec[vec_id], arg_vec[vec_id]); } - sg.store(lhs_ptrT(&lhs[base + it * sgSize]), - res_vec); + sg.store(lhs_multi_ptr, res_vec); } } else { @@ -223,19 +221,20 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); - const argT vec_el = sg.load(in_ptrT(&padded_vec[base % n1])); - resT mat_el = sg.load(res_ptrT(&mat[base])); + const argT vec_el = sg.load(in_multi_ptr); + resT mat_el = sg.load(out_multi_ptr); op(mat_el, vec_el); - sg.store(res_ptrT(&mat[base]), mat_el); + sg.store(out_multi_ptr, mat_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index b996a6d0ec..e1c23113c6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -115,7 +115,10 @@ template struct Expm1Functor // x, y finite numbers realT cosY_val; - const realT sinY_val = sycl::sincos(y, &cosY_val); + auto cosY_val_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::private_space, + sycl::access::decorated::yes>(&cosY_val); + const realT sinY_val = sycl::sincos(y, cosY_val_multi_ptr); const realT sinhalfY_val = std::sin(y / 2); const realT res_re = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index 268c679f00..32e97df58d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -57,7 +57,12 @@ struct FloorDivideFunctor resT operator()(const argT1 &in1, const argT2 &in2) { - if constexpr (std::is_integral_v || std::is_integral_v) { + if constexpr (std::is_same_v && + std::is_same_v) { + return (in2) ? static_cast(in1) : resT(0); + } + else if constexpr (std::is_integral_v || + std::is_integral_v) { if (in2 == argT2(0)) { return resT(0); } @@ -81,7 +86,16 @@ struct FloorDivideFunctor sycl::vec operator()(const sycl::vec &in1, const sycl::vec &in2) { - if constexpr (std::is_integral_v) { + if constexpr (std::is_same_v && + std::is_same_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + res[i] = (in2[i]) ? 
static_cast(in1[i]) : resT(0); + } + return res; + } + else if constexpr (std::is_integral_v) { sycl::vec res; #pragma unroll for (int i = 0; i < vec_sz; ++i) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 808e82539e..9b9fa95061 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -150,11 +150,11 @@ template struct SqrtFunctor int get_normal_scale_double(const double &v) const { - constexpr int float_significant_bits = 53; + constexpr int double_significant_bits = 52; constexpr std::uint64_t exponent_mask = 0x7ff; constexpr int exponent_bias = 1023; const int scale = static_cast( - (sycl::bit_cast(v) >> float_significant_bits) & + (sycl::bit_cast(v) >> double_significant_bits) & exponent_mask); return scale - exponent_bias; } diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index c8aae0a3b9..fd3fcc9681 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -146,9 +146,9 @@ struct ReductionOverGroupWithAtomicFunctor void operator()(sycl::nd_item<1> it) const { - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t iter_gid = it.get_global_id(0) / red_gws_; - const size_t reduction_batch_id = get_reduction_batch_id(it); + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg @@ -204,14 +204,6 @@ struct ReductionOverGroupWithAtomicFunctor } } } - -private: - size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const - { - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; - return reduction_batch_id; - } }; typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( @@ -241,6 +233,12 @@ class sum_reduction_seq_strided_krn; template class sum_reduction_seq_contig_krn; +template +class sum_reduction_axis0_over_group_with_atomics_contig_krn; + +template +class sum_reduction_axis1_over_group_with_atomics_contig_krn; + using dpctl::tensor::sycl_utils::choose_workgroup_size; template @@ -344,20 +342,6 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); - if (reduction_groups > 1) { - const size_t &max_wg = - d.get_info(); - - if (reduction_nelems < preferrered_reductions_per_wi * max_wg) { - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); - reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - } - } - auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; @@ -395,7 +379,7 @@ typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( /* @brief Reduce rows in a matrix */ template -sycl::event sum_reduction_over_group_with_atomics_contig_impl( +sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( sycl::queue exec_q, size_t iter_nelems, // number of reductions (num. 
of rows in a matrix // when reducing over rows) @@ -417,7 +401,7 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<2>(reduction_nelems, sg_sizes); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -463,11 +447,11 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( RowsIndexerT, NoOpIndexerT>; using ReductionIndexerT = NoOpIndexerT; - RowsIndexerT columns_indexer{ + RowsIndexerT rows_indexer{ 0, static_cast(iter_nelems), static_cast(reduction_nelems)}; NoOpIndexerT result_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, result_indexer}; ReductionIndexerT reduction_indexer{}; @@ -481,27 +465,95 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); - if (reduction_groups > 1) { - const size_t &max_wg = - d.get_info(); - - if (reduction_nelems < preferrered_reductions_per_wi * max_wg) { - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); - reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - } - } + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = + class sum_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } +} + +/* @brief Reduce rows in a matrix */ +template +sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of cols in a matrix + // when reducing over cols) + size_t reduction_nelems, // size of each reduction (length of cols, i.e. 
+ // number of rows) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(res_init_ev); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + constexpr size_t preferrered_reductions_per_wi = 8; + size_t reductions_per_wi = + (reduction_nelems < preferrered_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferrered_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + using KernelName = + class sum_reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), @@ -558,14 +610,13 @@ struct ReductionOverGroupNoAtomicFunctor void operator()(sycl::nd_item<1> it) const { - - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t iter_gid = it.get_global_id(0) / red_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + // work-items sums over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid @@ -642,7 +693,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( size_t reductions_per_wi(preferrered_reductions_per_wi); if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + // reduction only requires 1 work-group, can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1079,7 +1130,25 @@ struct SumOverAxisTempsStridedFactory }; template -struct SumOverAxisAtomicContigFactory +struct 
SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + return dpctl::tensor::kernels:: + sum_reduction_axis1_over_group_with_atomics_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory { fnT get() const { @@ -1087,7 +1156,8 @@ struct SumOverAxisAtomicContigFactory srcTy, dstTy>::is_defined) { return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_contig_impl; + sum_reduction_axis0_over_group_with_atomics_contig_impl; } else { return nullptr; diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp index 67ce2ca1f0..9da5466dbe 100644 --- a/dpctl/tensor/libtensor/include/kernels/where.hpp +++ b/dpctl/tensor/libtensor/include/kernels/where.hpp @@ -100,15 +100,6 @@ class WhereContigFunctor if (base + n_vecs * vec_sz * sgSize < nelems && sgSize == max_sgSize) { - using dst_ptrT = - sycl::multi_ptr; - using x_ptrT = - sycl::multi_ptr; - using cond_ptrT = - sycl::multi_ptr; sycl::vec dst_vec; sycl::vec x1_vec; sycl::vec x2_vec; @@ -117,14 +108,27 @@ class WhereContigFunctor #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { auto idx = base + it * sgSize; - x1_vec = sg.load(x_ptrT(&x1_p[idx])); - x2_vec = sg.load(x_ptrT(&x2_p[idx])); - cond_vec = sg.load(cond_ptrT(&cond_p[idx])); + auto x1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x1_p[idx]); + auto x2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x2_p[idx]); + auto cond_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&cond_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + x1_vec = sg.load(x1_multi_ptr); + x2_vec = sg.load(x2_multi_ptr); + cond_vec = sg.load(cond_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { dst_vec[k] = cond_vec[k] ? 
x1_vec[k] : x2_vec[k]; } - sg.store(dst_ptrT(&dst_p[idx]), dst_vec); + sg.store(dst_multi_ptr, dst_vec); } } else { diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp index ac02d26bf0..19bcf9d0a8 100644 --- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp @@ -91,7 +91,7 @@ device_allocate_and_pack(sycl::queue q, { // memory transfer optimization, use USM-host for temporary speeds up - // tranfer to device, especially on dGPUs + // transfer to device, especially on dGPUs using usm_host_allocatorT = sycl::usm_allocator; using shT = std::vector; @@ -144,12 +144,12 @@ struct StridedIndexer { } - size_t operator()(py::ssize_t gid) const + py::ssize_t operator()(py::ssize_t gid) const { return compute_offset(gid); } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { return compute_offset(static_cast(gid)); } @@ -159,7 +159,7 @@ struct StridedIndexer py::ssize_t starting_offset; py::ssize_t const *shape_strides; - size_t compute_offset(py::ssize_t gid) const + py::ssize_t compute_offset(py::ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; @@ -185,12 +185,12 @@ struct UnpackedStridedIndexer { } - size_t operator()(py::ssize_t gid) const + py::ssize_t operator()(py::ssize_t gid) const { return compute_offset(gid); } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { return compute_offset(static_cast(gid)); } @@ -201,7 +201,7 @@ struct UnpackedStridedIndexer py::ssize_t const *shape; py::ssize_t const *strides; - size_t compute_offset(py::ssize_t gid) const + py::ssize_t compute_offset(py::ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; @@ -223,11 +223,10 @@ struct Strided1DIndexer { } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { // ensure 0 <= gid < size - return static_cast(offset + - std::min(gid, size - 1) * step); + return offset + std::min(gid, size - 1) * step; } private: @@ -245,9 +244,9 @@ struct Strided1DCyclicIndexer { } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { - return static_cast(offset + (gid % size) * step); + return offset + (gid % size) * step; } private: diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp index f654087281..7cca7c7b5d 100644 --- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp +++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp @@ -238,6 +238,30 @@ template class CIndexer_vector } return; } + + template + void get_left_rolled_displacement(indT i, + ShapeTy shape, + StridesTy stride, + StridesTy shifts, + indT &disp) const + { + indT i_ = i; + indT d = 0; + for (int dim = nd; --dim > 0;) { + const indT si = shape[dim]; + const indT q = i_ / si; + const indT r = (i_ - q * si); + // assumes si > shifts[dim] >= 0 + const indT shifted_r = + (r < shifts[dim] ? r + si - shifts[dim] : r - shifts[dim]); + d += shifted_r * stride[dim]; + i_ = q; + } + const indT shifted_r = + (i_ < shifts[0] ? 
i_ + shape[0] - shifts[0] : i_ - shifts[0]); + disp = d + shifted_r * stride[0]; + } }; /* @@ -909,6 +933,55 @@ contract_iter4(vecT shape, out_strides3, disp3, out_strides4, disp4); } +/* + For purposes of iterating over elements of an array with `shape` and + strides `strides` given as pointers `compact_iteration(nd, shape, strides)` + may modify memory and returns the new length of the array. + + The new shape and new strides `(new_shape, new_strides)` are such that + iterating over them will traverse the same elements in the same order, + possibly with reduced dimensionality. + */ +template +int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides) +{ + if (nd < 2) + return nd; + + bool contractable = true; + for (int i = 0; i < nd; ++i) { + if (strides[i] < 0) { + contractable = false; + } + } + + int nd_ = nd; + while (contractable) { + bool changed = false; + for (int i = 0; i + 1 < nd_; ++i) { + StridesTy str = strides[i + 1]; + StridesTy jump = strides[i] - (shape[i + 1] - 1) * str; + + if (jump == str) { + changed = true; + shape[i] *= shape[i + 1]; + for (int j = i; j < nd_; ++j) { + strides[j] = strides[j + 1]; + } + for (int j = i + 1; j + 1 < nd_; ++j) { + shape[j] = shape[j + 1]; + } + --nd_; + break; + } + } + if (!changed) + break; + } + + return nd_; +} + } // namespace strides } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 0dd63fe973..d37967daef 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -205,24 +205,14 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, auto const &strides_vector = mask.get_strides_vector(); using shT = std::vector; - shT simplified_shape; - shT simplified_strides; - py::ssize_t offset(0); + shT compact_shape; + shT compact_strides; int mask_nd = mask.get_ndim(); int nd = mask_nd; - dpctl::tensor::py_internal::simplify_iteration_space_1( - nd, shape, strides_vector, simplified_shape, simplified_strides, - offset); - - if (nd == 1 && simplified_strides[0] == 1) { - auto fn = (use_i32) - ? 
mask_positions_contig_i32_dispatch_vector[mask_typeid] - : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - - return fn(exec_q, mask_size, mask_data, cumsum_data, depends); - } + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); // Strided implementation auto strided_fn = @@ -232,7 +222,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( - exec_q, host_task_events, simplified_shape, simplified_strides); + exec_q, host_task_events, compact_shape, compact_strides); py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); if (shape_strides == nullptr) { sycl::event::wait(host_task_events); @@ -253,7 +243,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, dependent_events.insert(dependent_events.end(), depends.begin(), depends.end()); - size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset, + size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data, dependent_events); sycl::event::wait(host_task_events); diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp index 8edf982b16..7114d87c47 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp @@ -60,7 +60,6 @@ static copy_for_reshape_fn_ptr_t std::pair copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, sycl::queue exec_q, const std::vector &depends) { @@ -109,7 +108,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, if (src_nelems == 1) { // handle special case of 1-element array int src_elemsize = src.get_elemsize(); - char *src_data = src.get_data(); + const char *src_data = src.get_data(); char *dst_data = dst.get_data(); sycl::event copy_ev = exec_q.copy(src_data, dst_data, src_elemsize); @@ -146,7 +145,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, } sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); - char *src_data = src.get_data(); + const char *src_data = src.get_data(); char *dst_data = dst.get_data(); std::vector all_deps(depends.size() + 1); @@ -154,7 +153,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); sycl::event copy_for_reshape_event = - fn(exec_q, shift, src_nelems, src_nd, dst_nd, shape_strides, src_data, + fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, dst_data, all_deps); auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp index 09caddf824..32d41fc159 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp @@ -40,7 +40,6 @@ namespace py_internal extern std::pair copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, sycl::queue exec_q, const std::vector &depends = {}); diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 0000000000..eee129932f --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,419 @@ +//===----------- Implementation of 
_tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "copy_for_roll.hpp" +#include "dpctl4pybind11.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/type_dispatch.hpp" +#include + +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair +copy_usm_ndarray_for_roll_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + py::ssize_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + size_t offset = + (shift > 0) ? 
(shift % src_nelems) : src_nelems + (shift % src_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (both_c_contig || both_f_contig) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + constexpr py::ssize_t zero_offset = 0; + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape_ptr; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data, + dst_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto fn = copy_for_roll_strided_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + if (shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_for_roll_event); + auto ctx = exec_q.get_context(); + cgh.host_task( + [shape_strides, ctx]() { sycl::free(shape_strides, ctx); }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +std::pair +copy_usm_ndarray_for_roll_nd(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + const std::vector &shifts, + sycl::queue exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same number of dimensions."); + } + + if (static_cast(src_nd) != shifts.size()) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires 
shifts to " + "contain an integral shift for each array dimension."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + py::ssize_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + // check for compatible queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + std::vector normalized_shifts{}; + normalized_shifts.reserve(src_nd); + + for (int i = 0; i < src_nd; ++i) { + // normalize shift parameter to be 0 <= offset < dim + py::ssize_t dim = src_shape_ptr[i]; + size_t offset = + (shifts[i] > 0) ? 
(shifts[i] % dim) : dim + (shifts[i] % dim); + + normalized_shifts.push_back(offset); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + auto const &common_shape = src.get_shape_vector(); + + constexpr py::ssize_t src_offset = 0; + constexpr py::ssize_t dst_offset = 0; + + auto fn = copy_for_roll_ndshift_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, common_shape, src_strides, dst_strides, + normalized_shifts); + + py::ssize_t *shape_strides_shifts = std::get<0>(ptr_size_event_tuple); + if (shape_strides_shifts == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_for_roll_event); + auto ctx = exec_q.get_context(); + cgh.host_task([shape_strides_shifts, ctx]() { + sycl::free(shape_strides_shifts, ctx); + }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +void init_copy_for_roll_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory; + DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.hpp b/dpctl/tensor/libtensor/source/copy_for_roll.hpp new file mode 100644 index 0000000000..0c00710e11 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_roll.hpp @@ -0,0 +1,58 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +copy_usm_ndarray_for_roll_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern std::pair +copy_usm_ndarray_for_roll_nd(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + const std::vector &shifts, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp index 4097e76178..0e7bc195e9 100644 --- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -210,7 +210,7 @@ void copy_numpy_ndarray_into_usm_ndarray( simplified_dst_strides.push_back(1); } - // Minumum and maximum element offsets for source np.ndarray + // Minimum and maximum element offsets for source np.ndarray py::ssize_t npy_src_min_nelem_offset(src_offset); py::ssize_t npy_src_max_nelem_offset(src_offset); for (int i = 0; i < nd; ++i) { diff --git a/dpctl/tensor/libtensor/source/device_support_queries.cpp b/dpctl/tensor/libtensor/source/device_support_queries.cpp index 946c36ad26..d04c9c9ed2 100644 --- a/dpctl/tensor/libtensor/source/device_support_queries.cpp +++ b/dpctl/tensor/libtensor/source/device_support_queries.cpp @@ -51,7 +51,7 @@ std::string _default_device_fp_type(sycl::device d) std::string _default_device_int_type(sycl::device) { - return "l"; // code for numpy.dtype('long') to be consisent + return "l"; // code for numpy.dtype('long') to be consistent // with NumPy's default integer type across // platforms. 
} diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index 2fb2d6078e..31f5250f8a 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -369,6 +369,45 @@ void simplify_iteration_space_4( } } +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + py::ssize_t _ravel_multi_index_c(std::vector const &mi, std::vector const &shape) { diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 1bd8ff5aa0..275129ad5c 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -90,6 +90,13 @@ void simplify_iteration_space_4(int &, py::ssize_t &, py::ssize_t &); +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + py::ssize_t _ravel_multi_index_c(std::vector const &, std::vector const &); py::ssize_t _ravel_multi_index_f(std::vector const &, diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp index 3502a81a0e..13ab268b55 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ b/dpctl/tensor/libtensor/source/sum_reductions.cpp @@ -88,8 +88,11 @@ static sum_reduction_strided_impl_fn_ptr using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; static sum_reduction_contig_impl_fn_ptr - sum_over_axis_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static sum_reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; std::pair py_sum_over_axis( dpctl::tensor::usm_ndarray src, @@ -194,8 +197,32 @@ std::pair py_sum_over_axis( if ((is_src_c_contig && is_dst_c_contig) || (is_src_f_contig && dst_nelems == 1)) { - auto fn = sum_over_axis_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; + auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event sum_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + 
dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {sum_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, sum_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; if (fn != nullptr) { size_t iter_nelems = dst_nelems; @@ -271,27 +298,58 @@ std::pair py_sum_over_axis( iteration_src_offset, iteration_dst_offset); } - if (supports_atomics && (reduction_nd == 1) && - (simplified_reduction_src_strides[0] == 1) && (iteration_nd == 1) && - ((simplified_iteration_shape[0] == 1) || - ((simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems)))) - { - auto fn = - sum_over_axis_contig_atomic_dispatch_table[src_typeid][dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; + if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(simplified_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; + if (fn != nullptr) { + sycl::event sum_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, iteration_dst_offset, - reduction_src_offset, depends); + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {sum_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + sum_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; + if (fn != nullptr) { + sycl::event sum_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {sum_over_axis0_contig_ev}); - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); + return std::make_pair(keep_args_event, + sum_over_axis0_contig_ev); + } } } @@ -451,11 +509,17 @@ void populate_sum_over_axis_dispatch_table(void) dtb2; dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); - using dpctl::tensor::kernels::SumOverAxisAtomicContigFactory; + using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; 
DispatchTableBuilder + SumOverAxis1AtomicContigFactory, num_types> dtb3; - dtb3.populate_dispatch_table(sum_over_axis_contig_atomic_dispatch_table); + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); } namespace py = pybind11; diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 9b4ba6cdad..691a56b11f 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -37,6 +37,7 @@ #include "boolean_reductions.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" +#include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" #include "elementwise_functions.hpp" @@ -68,6 +69,11 @@ using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; +/* =========================== Copy for roll ============================= */ + +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; + /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; @@ -120,6 +126,7 @@ void init_dispatch_vectors(void) using namespace dpctl::tensor::py_internal; init_copy_for_reshape_dispatch_vectors(); + init_copy_for_roll_dispatch_vectors(); init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_eye_ctor_dispatch_vectors(); @@ -221,11 +228,27 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " "number of elements using underlying 'C'-contiguous order for flat " + "traversal. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for flat " "traversal with shift. " "Returns a tuple of events: (ht_event, comp_event)", py::arg("src"), py::arg("dst"), py::arg("shift"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for " + "traversal with shifts along each axis. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shifts"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " "specified by " @@ -245,7 +268,7 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_copy_numpy_ndarray_into_usm_ndarray", ©_numpy_ndarray_into_usm_ndarray, - "Copy fom numpy array `src` into usm_ndarray `dst` synchronously.", + "Copy from numpy array `src` into usm_ndarray `dst` synchronously.", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py index 9c800af812..2d2ec96fec 100644 --- a/dpctl/tests/elementwise/test_abs.py +++ b/dpctl/tests/elementwise/test_abs.py @@ -76,6 +76,15 @@ def test_abs_usm_type(usm_type): assert np.allclose(dpt.asnumpy(Y), expected_Y) +def test_abs_types_prop(): + types = dpt.abs.types_ + assert types is None + types = dpt.abs.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.abs.types_ + + @pytest.mark.parametrize("dtype", _all_dtypes[1:]) def test_abs_order(dtype): q = get_queue_or_skip() diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py index 891dda5252..2f5fd7c02a 100644 --- a/dpctl/tests/elementwise/test_add.py +++ b/dpctl/tests/elementwise/test_add.py @@ -258,6 +258,15 @@ def __sycl_usm_array_interface__(self): dpt.add(a, c) +def test_add_types_property(): + types = dpt.add.types_ + assert types is None + types = dpt.add.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.add.types_ + + def test_add_errors(): get_queue_or_skip() try: diff --git a/dpctl/tests/test_sycl_queue_manager.py b/dpctl/tests/test_sycl_queue_manager.py index c01dca0139..d640042eb9 100644 --- a/dpctl/tests/test_sycl_queue_manager.py +++ b/dpctl/tests/test_sycl_queue_manager.py @@ -1,6 +1,6 @@ # Data Parallel Control (dpctl) # -# Copyright 2020-2022 Intel Corporation +# Copyright 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -69,8 +69,8 @@ def test_is_in_device_context_inside_nested_device_ctxt_cpu():
     n = cpu.max_compute_units
     n_half = n // 2
     try:
-        d0, d1 = cpu.create_subdevices(partition=[n_half, n - n_half])
-    except Exception:
+        d0, d1 = cpu.create_sub_devices(partition=[n_half, n - n_half])
+    except dpctl.SyclSubDeviceCreationError:
         pytest.skip("Could not create subdevices")
     assert 0 == dpctl.get_num_activated_queues()
     with dpctl.device_context(d0):
diff --git a/dpctl/tests/test_tensor_asarray.py b/dpctl/tests/test_tensor_asarray.py
index f9bc31972c..e73c35ce26 100644
--- a/dpctl/tests/test_tensor_asarray.py
+++ b/dpctl/tests/test_tensor_asarray.py
@@ -71,7 +71,7 @@ def test_asarray_from_numpy():
     assert type(Y) is dpt.usm_ndarray
     assert Y.shape == Xnp.shape
     assert Y.dtype == Xnp.dtype
-    # Fortan contiguous case
+    # Fortran contiguous case
     Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="f4", order="F")
     Y = dpt.asarray(Xnp, usm_type="shared")
     assert type(Y) is dpt.usm_ndarray
diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py
index fc2a0ec8de..403a823324 100644
--- a/dpctl/tests/test_tensor_sum.py
+++ b/dpctl/tests/test_tensor_sum.py
@@ -172,3 +172,18 @@ def test_largish_reduction(arg_dtype, n):
 
     assert dpt.all(dpt.equal(y1, y2))
     assert dpt.all(dpt.equal(y1, n * m))
+
+
+def test_axis0_bug():
+    "gh-1391"
+    get_queue_or_skip()
+
+    sh = (1, 2, 3)
+    a = dpt.arange(sh[0] * sh[1] * sh[2], dtype="i4")
+    a = dpt.reshape(a, sh)
+    aT = dpt.permute_dims(a, (2, 1, 0))
+
+    s = dpt.sum(aT, axis=2)
+    expected = dpt.asarray([[0, 3], [1, 4], [2, 5]])
+
+    assert dpt.all(s == expected)
diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index 77edc1f22e..72f5aabebb 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -239,8 +239,9 @@ def test_copy_scalar_with_func(func, shape, dtype):
         X = dpt.usm_ndarray(shape, dtype=dtype)
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
-    Y = np.arange(1, X.size + 1, dtype=dtype).reshape(shape)
-    X.usm_data.copy_from_host(Y.reshape(-1).view("|u1"))
+    Y = np.arange(1, X.size + 1, dtype=dtype)
+    X.usm_data.copy_from_host(Y.view("|u1"))
+    Y.shape = tuple()
     assert func(X) == func(Y)
 
 
@@ -254,8 +255,9 @@ def test_copy_scalar_with_method(method, shape, dtype):
         X = dpt.usm_ndarray(shape, dtype=dtype)
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
-    Y = np.arange(1, X.size + 1, dtype=dtype).reshape(shape)
-    X.usm_data.copy_from_host(Y.reshape(-1).view("|u1"))
+    Y = np.arange(1, X.size + 1, dtype=dtype)
+    X.usm_data.copy_from_host(Y.view("|u1"))
+    Y.shape = tuple()
     assert getattr(X, method)() == getattr(Y, method)()
 
 
@@ -1438,17 +1440,26 @@ def test_real_imag_views():
     n, m = 2, 3
     try:
         X = dpt.usm_ndarray((n, m), "c8")
+        X_scalar = dpt.usm_ndarray((), dtype="c8")
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
     Xnp_r = np.arange(n * m, dtype="f4").reshape((n, m))
     Xnp_i = np.arange(n * m, 2 * n * m, dtype="f4").reshape((n, m))
     Xnp = Xnp_r + 1j * Xnp_i
     X[:] = Xnp
-    assert np.array_equal(dpt.to_numpy(X.real), Xnp.real)
+    X_real = X.real
+    X_imag = X.imag
+    assert np.array_equal(dpt.to_numpy(X_real), Xnp.real)
     assert np.array_equal(dpt.to_numpy(X.imag), Xnp.imag)
+    assert not X_real.flags["C"] and not X_real.flags["F"]
+    assert not X_imag.flags["C"] and not X_imag.flags["F"]
+    assert X_real.strides == X_imag.strides
     assert np.array_equal(dpt.to_numpy(X[1:].real), Xnp[1:].real)
     assert np.array_equal(dpt.to_numpy(X[1:].imag), Xnp[1:].imag)
+    X_scalar[...] = complex(n * m, 2 * n * m)
+    assert X_scalar.real and X_scalar.imag
+
 
 
 @pytest.mark.parametrize(
     "dtype",
@@ -2125,6 +2136,8 @@ def test_flags():
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
     f = x.flags
+    # check comparison with generic types
+    assert f != Ellipsis
     f.__repr__()
     assert f.c_contiguous == f["C"]
     assert f.f_contiguous == f["F"]
@@ -2133,8 +2146,6 @@ def test_flags():
     assert f.forc == f["FORC"]
     assert f.fnc == f["FNC"]
     assert f.writable == f["W"]
-    # check comparison with generic types
-    f == Ellipsis
 
 
 def test_asarray_uint64():
diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py
index 9d166226e7..9183226be2 100644
--- a/dpctl/tests/test_usm_ndarray_indexing.py
+++ b/dpctl/tests/test_usm_ndarray_indexing.py
@@ -1044,6 +1044,19 @@ def test_extract_all_1d():
     res2 = dpt.extract(sel, x)
     assert (dpt.asnumpy(res2) == expected_res).all()
 
+    # test strided case
+    x = dpt.arange(15, dtype="i4")
+    sel_np = np.zeros(15, dtype="?")
+    np.put(sel_np, np.random.choice(sel_np.size, size=7), True)
+    sel = dpt.asarray(sel_np)
+
+    res = x[sel[::-1]]
+    expected_res = dpt.asnumpy(x)[sel_np[::-1]]
+    assert (dpt.asnumpy(res) == expected_res).all()
+
+    res2 = dpt.extract(sel[::-1], x)
+    assert (dpt.asnumpy(res2) == expected_res).all()
+
 
 def test_extract_all_2d():
     get_queue_or_skip()
@@ -1287,6 +1300,38 @@ def test_nonzero():
     assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all()
 
 
+def test_nonzero_f_contig():
+    "See gh-1370"
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5), dtype="?", order="F")
+    mask[2, 3] = True
+
+    expected_res = (2, 3)
+    res = dpt.nonzero(mask)
+
+    assert expected_res == res
+    assert mask[res]
+
+
+def test_nonzero_compacting():
+    """See gh-1370.
+    Test with input where dimensionality
+    of iteration space is compacted from 3d to 2d
+    """
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5, 5), dtype="?", order="F")
+    mask[3, 2, 1] = True
+    mask_view = mask[..., :3]
+
+    expected_res = (3, 2, 1)
+    res = dpt.nonzero(mask_view)
+
+    assert expected_res == res
+    assert mask_view[res]
+
+
 def test_assign_scalar():
     get_queue_or_skip()
     x = dpt.arange(-5, 5, dtype="i8")
diff --git a/dpctl/tests/test_usm_ndarray_linalg.py b/dpctl/tests/test_usm_ndarray_linalg.py
new file mode 100644
index 0000000000..4023eb8ad7
--- /dev/null
+++ b/dpctl/tests/test_usm_ndarray_linalg.py
@@ -0,0 +1,48 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip
+
+
+def test_matrix_transpose():
+    get_queue_or_skip()
+
+    X = dpt.reshape(dpt.arange(2 * 3, dtype="i4"), (2, 3))
+    res = dpt.matrix_transpose(X)
+    expected_res = X.mT
+
+    assert expected_res.shape == res.shape
+    assert expected_res.flags["C"] == res.flags["C"]
+    assert expected_res.flags["F"] == res.flags["F"]
+    assert dpt.all(X.mT == res)
+
+
+def test_matrix_transpose_arg_validation():
+    get_queue_or_skip()
+
+    X = dpt.empty(5, dtype="i4")
+    with pytest.raises(ValueError):
+        dpt.matrix_transpose(X)
+
+    X = dict()
+    with pytest.raises(TypeError):
+        dpt.matrix_transpose(X)
+
+    X = dpt.empty((5, 5), dtype="i4")
+    assert isinstance(dpt.matrix_transpose(X), dpt.usm_ndarray)
diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py
index 1cee5e6c8f..6152a15aae 100644
--- a/dpctl/tests/test_usm_ndarray_manipulation.py
+++ b/dpctl/tests/test_usm_ndarray_manipulation.py
@@ -648,6 +648,19 @@ def test_roll_2d(data):
     assert_array_equal(Ynp, dpt.asnumpy(Y))
 
 
+def test_roll_validation():
+    get_queue_or_skip()
+
+    X = dict()
+    with pytest.raises(TypeError):
+        dpt.roll(X)
+
+    X = dpt.empty((1, 2, 3))
+    shift = ((2, 3, 1), (1, 2, 3))
+    with pytest.raises(ValueError):
+        dpt.roll(X, shift=shift, axis=(0, 1, 2))
+
+
 def test_concat_incorrect_type():
     Xnp = np.ones((2, 2))
     pytest.raises(TypeError, dpt.concat)
diff --git a/libsyclinterface/include/dpctl_data_types.h b/libsyclinterface/include/dpctl_data_types.h
index c8c812b86d..2e644f1327 100644
--- a/libsyclinterface/include/dpctl_data_types.h
+++ b/libsyclinterface/include/dpctl_data_types.h
@@ -81,21 +81,21 @@ typedef signed int ssize_t;
 #endif /* _MSC_VER */
 
 /*!
- @brief Represents tha largest possible value of a 64 bit signed integer.
+ @brief Represents the largest possible value of a 64 bit signed integer.
 */
 #if !defined(INT64_MAX)
 #define INT64_MAX 9223372036854775807LL
 #endif
 
 /*!
- @brief Represents tha smallest possible value of a 64 bit signed integer.
+ @brief Represents the smallest possible value of a 64 bit signed integer.
 */
 #if !defined(INT64_MIN)
 #define INT64_MIN ((-INT64_MAX) - 1)
 #endif
 
 /*!
- @brief Represents tha largest possible value of a 64bit unsigned integer.
+ @brief Represents the largest possible value of a 64bit unsigned integer.
 */
 #if !defined(UINT64_MAX)
 #define UINT64_MAX 0xffffffffffffffffULL
diff --git a/libsyclinterface/include/dpctl_device_selection.hpp b/libsyclinterface/include/dpctl_device_selection.hpp
index 6aa4e69ec7..9da0072ab1 100644
--- a/libsyclinterface/include/dpctl_device_selection.hpp
+++ b/libsyclinterface/include/dpctl_device_selection.hpp
@@ -1,4 +1,5 @@
-//===-- dpctl_device_selection.h - Device selector class declar. --*-C++-*- =//
+//===-- dpctl_device_selection.h -
+// Device selector class declaration --*-C++-*- =//
 //
 //
 // Data Parallel Control (dpctl)
diff --git a/libsyclinterface/include/dpctl_sycl_device_interface.h b/libsyclinterface/include/dpctl_sycl_device_interface.h
index 50c4d1f1f2..3050d7449f 100644
--- a/libsyclinterface/include/dpctl_sycl_device_interface.h
+++ b/libsyclinterface/include/dpctl_sycl_device_interface.h
@@ -233,19 +233,6 @@ DPCTL_API
 __dpctl_keep size_t *
 DPCTLDevice_GetMaxWorkItemSizes3d(__dpctl_keep const DPCTLSyclDeviceRef DRef);
 
-/*!
- * @brief Wrapper for deprecated get_info().
- *
- * @param DRef Opaque pointer to a ``sycl::device``
- * @return Returns the valid result if device exists else returns NULL.
- * @ingroup DeviceInterface
- */
-#if __cplusplus || (defined(__GNUC__) && __GNUC__ > 10)
-[[deprecated("Use DPCTLDevice_WorkItemSizes3d instead")]]
-#endif
-DPCTL_API __dpctl_keep size_t *
-DPCTLDevice_GetMaxWorkItemSizes(__dpctl_keep const DPCTLSyclDeviceRef DRef);
-
 /*!
  * @brief Wrapper for get_info().
  *
diff --git a/libsyclinterface/include/dpctl_sycl_device_selector_interface.h b/libsyclinterface/include/dpctl_sycl_device_selector_interface.h
index c1cd5fcd5c..f439f3281a 100644
--- a/libsyclinterface/include/dpctl_sycl_device_selector_interface.h
+++ b/libsyclinterface/include/dpctl_sycl_device_selector_interface.h
@@ -19,7 +19,7 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This header declares C contructors for the various SYCL device_selector
+/// This header declares C constructors for the various SYCL device_selector
 /// classes.
 ///
 //===----------------------------------------------------------------------===//
diff --git a/libsyclinterface/include/dpctl_sycl_event_interface.h b/libsyclinterface/include/dpctl_sycl_event_interface.h
index a49d17f12e..6bb4d0412c 100644
--- a/libsyclinterface/include/dpctl_sycl_event_interface.h
+++ b/libsyclinterface/include/dpctl_sycl_event_interface.h
@@ -44,7 +44,7 @@ DPCTL_C_EXTERN_C_BEGIN
 DPCTL_DECLARE_VECTOR(Event)
 
 /*!
- * @brief A wrapper for ``sycl::event`` contructor to construct a new event.
+ * @brief A wrapper for ``sycl::event`` constructor to construct a new event.
  *
  * @return An opaque DPCTLSyclEventRef pointer wrapping a ``sycl::event``.
  * @ingroup EventInterface
diff --git a/libsyclinterface/include/dpctl_sycl_queue_interface.h b/libsyclinterface/include/dpctl_sycl_queue_interface.h
index 8dd07280d2..1c5e53a395 100644
--- a/libsyclinterface/include/dpctl_sycl_queue_interface.h
+++ b/libsyclinterface/include/dpctl_sycl_queue_interface.h
@@ -42,8 +42,8 @@ DPCTL_C_EXTERN_C_BEGIN
 */
 
 /*!
- * @brief A wrapper for sycl::queue contructor to construct a new queue from the
- * provided context, device, async handler and propertis bit flags.
+ * @brief A wrapper for sycl::queue constructor to construct a new queue from
+ * the provided context, device, async handler and properties bit flags.
  *
  * @param CRef An opaque pointer to a sycl::context.
 * @param DRef An opaque pointer to a sycl::device
@@ -362,7 +362,7 @@ DPCTL_API
 size_t DPCTLQueue_Hash(__dpctl_keep const DPCTLSyclQueueRef QRef);
 
 /*!
- * @brief C-API wraper for ``sycl::queue::submit_barrier()``.
+ * @brief C-API wrapper for ``sycl::queue::submit_barrier()``.
 *
 * @param QRef An opaque pointer to the ``sycl::queue``.
 * @return An opaque pointer to the ``sycl::event`` returned by the
@@ -373,7 +373,7 @@ __dpctl_give DPCTLSyclEventRef
 DPCTLQueue_SubmitBarrier(__dpctl_keep const DPCTLSyclQueueRef QRef);
 
 /*!
- * @brief C-API wraper for ``sycl::queue::submit_barrier(event_vector)``.
+ * @brief C-API wrapper for ``sycl::queue::submit_barrier(event_vector)``.
 *
 * @param QRef An opaque pointer to the ``sycl::queue``.
 * @param DepEvents List of dependent DPCTLSyclEventRef objects (events)
diff --git a/libsyclinterface/include/dpctl_vector.h b/libsyclinterface/include/dpctl_vector.h
index fea49925ea..792368bbc3 100644
--- a/libsyclinterface/include/dpctl_vector.h
+++ b/libsyclinterface/include/dpctl_vector.h
@@ -83,8 +83,8 @@ DPCTL_C_EXTERN_C_BEGIN
     /*! \
      @brief Returns the element at the specified index. \
      @param VRef Opaque pointer to a vector. \
-     @param index The index postion of the element to be returned. \
-     @return The element at the specified postion, if the index position is \
+     @param index The index position of the element to be returned. \
+     @return The element at the specified position, if the index position is \
      out of bounds then a nullptr is returned. \
     */ \
     DPCTL_API \
diff --git a/libsyclinterface/source/dpctl_sycl_device_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
index bcdd46a9fd..73dfcc7e11 100644
--- a/libsyclinterface/source/dpctl_sycl_device_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
@@ -266,12 +266,6 @@ DPCTLDevice_GetMaxWorkItemSizes3d(__dpctl_keep const DPCTLSyclDeviceRef DRef)
     return DPCTLDevice__GetMaxWorkItemSizes<3>(DRef);
 }
 
-__dpctl_keep size_t *
-DPCTLDevice_GetMaxWorkItemSizes(__dpctl_keep const DPCTLSyclDeviceRef DRef)
-{
-    return DPCTLDevice__GetMaxWorkItemSizes<3>(DRef);
-}
-
 size_t
 DPCTLDevice_GetMaxWorkGroupSize(__dpctl_keep const DPCTLSyclDeviceRef DRef)
 {
diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
index 387374a032..ce318ce37e 100644
--- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
@@ -143,6 +143,9 @@ std::unique_ptr<property_list> create_property_list(int properties)
         propList = std::make_unique<property_list>(
             sycl::property::queue::in_order());
     }
+    else {
+        propList = std::make_unique<property_list>();
+    }
 
     if (_prop) {
         std::stringstream ss;
@@ -185,7 +188,7 @@ DPCTLQueue_Create(__dpctl_keep const DPCTLSyclContextRef CRef,
     }
     auto propList = create_property_list(properties);
 
-    if (propList && handler) {
+    if (handler) {
         try {
             auto Queue = new queue(*ctx, *dev, DPCTL_AsyncErrorHandler(handler),
                                    *propList);
@@ -194,26 +197,9 @@ DPCTLQueue_Create(__dpctl_keep const DPCTLSyclContextRef CRef,
             error_handler(e, __FILE__, __func__, __LINE__);
         }
     }
-    else if (properties) {
-        try {
-            auto Queue = new queue(*ctx, *dev, *propList);
-            q = wrap(Queue);
-        } catch (std::exception const &e) {
-            error_handler(e, __FILE__, __func__, __LINE__);
-        }
-    }
-    else if (handler) {
-        try {
-            auto Queue =
-                new queue(*ctx, *dev, DPCTL_AsyncErrorHandler(handler));
-            q = wrap(Queue);
-        } catch (std::exception const &e) {
-            error_handler(e, __FILE__, __func__, __LINE__);
-        }
-    }
     else {
         try {
-            auto Queue = new queue(*ctx, *dev);
+            auto Queue = new queue(*ctx, *dev, *propList);
             q = wrap(Queue);
         } catch (std::exception const &e) {
             error_handler(e, __FILE__, __func__, __LINE__);
diff --git a/libsyclinterface/tests/test_sycl_device_interface.cpp b/libsyclinterface/tests/test_sycl_device_interface.cpp
index 7e92c8c9de..dd20c738df 100644
--- a/libsyclinterface/tests/test_sycl_device_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_device_interface.cpp
@@ -175,14 +175,6 @@ TEST_P(TestDPCTLSyclDeviceInterface, ChkGetMaxWorkItemSizes3d)
     EXPECT_NO_FATAL_FAILURE(DPCTLSize_t_Array_Delete(sizes));
 }
 
-TEST_P(TestDPCTLSyclDeviceInterface, ChkGetMaxWorkItemSizes)
-{
-    size_t *sizes = nullptr;
-    EXPECT_NO_FATAL_FAILURE(sizes = DPCTLDevice_GetMaxWorkItemSizes(DRef));
-    EXPECT_TRUE(sizes != nullptr);
-    EXPECT_NO_FATAL_FAILURE(DPCTLSize_t_Array_Delete(sizes));
-}
-
 TEST_P(TestDPCTLSyclDeviceInterface, ChkGetMaxWorkGroupSize)
 {
     size_t n = 0;
@@ -625,10 +617,24 @@ TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemDims)
     ASSERT_TRUE(md == 0);
 }
 
-TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes)
+TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes1d)
+{
+    size_t *sz = nullptr;
+    EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes1d(Null_DRef));
+    ASSERT_TRUE(sz == nullptr);
+}
+
+TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes2d)
+{
+    size_t *sz = nullptr;
+    EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes2d(Null_DRef));
+    ASSERT_TRUE(sz == nullptr);
+}
+
+TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes3d)
 {
     size_t *sz = nullptr;
-    EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes(Null_DRef));
+    EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes3d(Null_DRef));
     ASSERT_TRUE(sz == nullptr);
 }
diff --git a/scripts/gen_coverage.py b/scripts/gen_coverage.py
index 41b30d00d9..93328f1b85 100644
--- a/scripts/gen_coverage.py
+++ b/scripts/gen_coverage.py
@@ -57,7 +57,7 @@ def run(
         "-DDPCTL_BUILD_CAPI_TESTS=ON",
         "-DDPCTL_COVERAGE_REPORT_OUTPUT_DIR=" + setup_dir,
     ]
-    env = None
+    env = dict()
     if bin_llvm:
         env = {
             "PATH": ":".join((os.environ.get("PATH", ""), bin_llvm)),
diff --git a/scripts/gen_docs.py b/scripts/gen_docs.py
index 377b37f6d8..9e2285a477 100644
--- a/scripts/gen_docs.py
+++ b/scripts/gen_docs.py
@@ -59,7 +59,7 @@ def run(
         cmake_args.append("-DDPCTL_ENABLE_DOXYREST=ON")
         cmake_args.append("-DDoxyrest_DIR=" + doxyrest_dir)
 
-    env = None
+    env = dict()
     if bin_llvm:
         env = {
             "PATH": ":".join((os.environ.get("PATH", ""), bin_llvm)),
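# Reviewer note, not part of the patch: the new dpctl/tests/test_usm_ndarray_linalg.py
# exercises dpt.matrix_transpose(), which mirrors the array-API x.mT property of
# usm_ndarray. A minimal usage sketch, assuming dpctl is installed and a working SYCL
# device is available:
#
#     import dpctl.tensor as dpt
#
#     x = dpt.reshape(dpt.arange(6, dtype="i4"), (2, 3))
#     y = dpt.matrix_transpose(x)  # same result as x.mT; swaps the last two axes
#     assert y.shape == (3, 2)
#     assert bool(dpt.all(y == x.mT))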