diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index f717f4b309..d0ad1c8205 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -18,7 +18,7 @@ jobs: strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v3 with: @@ -63,7 +63,7 @@ jobs: strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] env: conda-bld: C:\Miniconda\conda-bld\win-64\ steps: @@ -102,7 +102,7 @@ jobs: strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] experimental: [false] runner: [ubuntu-20.04] continue-on-error: ${{ matrix.experimental }} @@ -185,7 +185,7 @@ jobs: shell: cmd /C CALL {0} strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] experimental: [false] runner: [windows-latest] continue-on-error: ${{ matrix.experimental }} @@ -300,7 +300,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] steps: - name: Download artifact uses: actions/download-artifact@v3 @@ -324,7 +324,7 @@ jobs: runs-on: windows-latest strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] steps: - name: Download artifact uses: actions/download-artifact@v3 @@ -608,7 +608,11 @@ jobs: echo "Array API standard conformance tests failed to run for dpctl=$PACKAGE_VERSION." exit 1 fi + - name: Output API summary + shell: bash -l {0} + run: echo "::notice ${{ env.MESSAGE }}" - name: Post result to PR + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork }} uses: mshick/add-pr-comment@v1 with: message: | diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml index 768d958e02..84bbed4622 100644 --- a/.github/workflows/generate-docs.yml +++ b/.github/workflows/generate-docs.yml @@ -49,7 +49,7 @@ jobs: if: ${{ !github.event.pull_request || github.event.action != 'closed' }} shell: bash -l {0} run: | - pip install numpy cython setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics + pip install numpy cython setuptools scikit-build cmake sphinx"<7.2" sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics - name: Checkout repo uses: actions/checkout@v3 with: @@ -76,7 +76,7 @@ jobs: mv ../cmake-install/docs/docs ~/docs git clean -dfx - name: Publish docs - if: ${{ github.ref == 'refs/heads/master' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.ref == 'refs/heads/master' }} shell: bash -l {0} run: | git remote add tokened_docs https://IntelPython:${{ secrets.GITHUB_TOKEN }}@github.com/IntelPython/dpctl.git @@ -91,8 +91,15 @@ jobs: git config --global user.email 'github-actions[doc-deploy-bot]@users.noreply.github.com' git commit -m "Latest docs." 
git push tokened_docs gh-pages + - name: Save built docs as an artifact + if: ${{ github.event.pull_request && github.event.pull_request.head.repo.fork && github.event.action != 'closed'}} + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGE_NAME }} rendered documentation + path: ~/docs + - name: Publish pull-request docs - if: ${{ github.event.pull_request && github.event.action != 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action != 'closed' }} env: PR_NUM: ${{ github.event.number }} shell: bash -l {0} @@ -111,7 +118,7 @@ jobs: git commit -m "Docs for pull request ${PR_NUM}" git push tokened_docs gh-pages - name: Unpublish pull-request docs - if: ${{ github.event.pull_request && github.event.action == 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action == 'closed' }} env: PR_NUM: ${{ github.event.number }} shell: bash -l {0} @@ -128,7 +135,7 @@ jobs: git commit -m "Removing docs for closed pull request ${PR_NUM}" git push tokened_docs gh-pages - name: Comment with URL to published pull-request docs - if: ${{ github.event.pull_request && github.event.action != 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action != 'closed' }} env: PR_NUM: ${{ github.event.number }} uses: mshick/add-pr-comment@v1 @@ -138,7 +145,7 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} repo-token-user-login: 'github-actions[bot]' - name: Comment with URL about removal of PR docs - if: ${{ github.event.pull_request && github.event.action == 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action == 'closed' }} env: PR_NUM: ${{ github.event.number }} uses: mshick/add-pr-comment@v1 diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml index 55d57653f4..d12747f3a9 100644 --- a/.github/workflows/os-llvm-sycl-build.yml +++ b/.github/workflows/os-llvm-sycl-build.yml @@ -97,18 +97,18 @@ jobs: - name: Install system components shell: bash -l {0} run: | - sudo apt-get install ninja-build libtinfo5 + sudo apt-get install libtinfo5 - name: Setup Python uses: actions/setup-python@v4 with: - python-version: '3.9' + python-version: '3.11' architecture: x64 - name: Install dpctl dependencies shell: bash -l {0} run: | - pip install numpy cython setuptools pytest scikit-build cmake + pip install numpy cython setuptools pytest scikit-build cmake ninja - name: Checkout repo uses: actions/checkout@v3 diff --git a/cmake/FindDpctl.cmake b/cmake/FindDpctl.cmake index e917aaa194..fe75f3767f 100644 --- a/cmake/FindDpctl.cmake +++ b/cmake/FindDpctl.cmake @@ -17,15 +17,8 @@ # if(NOT Dpctl_FOUND) - set(_find_extra_args) - if(Dpctl_FIND_REQUIRED) - list(APPEND _find_extra_args REQUIRED) - endif() - if(Dpctl_FIND_QUIET) - list(APPEND _find_extra_args QUIET) - endif() - find_package(PythonInterp ${_find_extra_args}) - find_package(PythonLibs ${_find_extra_args}) + find_package(Python 3.9 REQUIRED + COMPONENTS Interpreter Development.Module) if(PYTHON_EXECUTABLE) execute_process(COMMAND "${PYTHON_EXECUTABLE}" diff --git a/conda-recipe/run_test.bat b/conda-recipe/run_test.bat index 833ddfd21c..85cac031e7 100644 --- a/conda-recipe/run_test.bat +++ b/conda-recipe/run_test.bat @@ -3,8 +3,8 @@ "%PYTHON%" -c "import dpctl; print(dpctl.__version__)" if errorlevel 1 exit 1 -"%PYTHON%" -c "import dpctl; dpctl.lsplatform()" +"%PYTHON%" -m dpctl -f if 
errorlevel 1 exit 1 -python -m pytest -q -p no:faulthandler -ra --disable-warnings --pyargs dpctl -vv +python -m pytest -q -ra --disable-warnings --pyargs dpctl -vv if errorlevel 1 exit 1 diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh index 4b3566d7bd..63ae7996cc 100644 --- a/conda-recipe/run_test.sh +++ b/conda-recipe/run_test.sh @@ -3,5 +3,5 @@ set -e ${PYTHON} -c "import dpctl; print(dpctl.__version__)" -${PYTHON} -c "import dpctl; dpctl.lsplatform(verbosity=2)" -${PYTHON} -m pytest -q -ra --disable-warnings -p no:faulthandler --cov dpctl --cov-report term-missing --pyargs dpctl -vv +${PYTHON} -m dpctl -f +${PYTHON} -m pytest -q -ra --disable-warnings --cov dpctl --cov-report term-missing --pyargs dpctl -vv diff --git a/dpctl/_sycl_context.pyx b/dpctl/_sycl_context.pyx index 9a9f7c5277..71c2fb7591 100644 --- a/dpctl/_sycl_context.pyx +++ b/dpctl/_sycl_context.pyx @@ -114,7 +114,7 @@ cdef class SyclContext(_SyclContext): ctx = dpctl.SyclContext() print(ctx.get_devices()) - - Invoking the constuctor with a specific filter string that creates a + - Invoking the constructor with a specific filter string that creates a context for the device corresponding to the filter string. :Example: @@ -127,7 +127,7 @@ cdef class SyclContext(_SyclContext): d = ctx.get_devices()[0] assert(d.is_gpu) - - Invoking the constuctor with a :class:`dpctl.SyclDevice` object + - Invoking the constructor with a :class:`dpctl.SyclDevice` object creates a context for that device. :Example: @@ -141,7 +141,7 @@ cdef class SyclContext(_SyclContext): d = ctx.get_devices()[0] assert(d.is_gpu) - - Invoking the constuctor with a list of :class:`dpctl.SyclDevice` + - Invoking the constructor with a list of :class:`dpctl.SyclDevice` objects creates a common context for all the devices. This constructor call is especially useful when creation a context for multiple sub-devices. @@ -159,7 +159,7 @@ cdef class SyclContext(_SyclContext): ctx = dpctl.SyclContext(sub_devices) assert(len(ctx.get_devices) == len(sub_devices)) - - Invoking the constuctor with a named ``PyCapsule`` with name + - Invoking the constructor with a named ``PyCapsule`` with name **"SyclContextRef"** that carries a pointer to a ``sycl::context`` object. The capsule will be renamed upon successful consumption to ensure one-time use. A new named capsule can be constructed by @@ -430,7 +430,7 @@ cdef class SyclContext(_SyclContext): return num_devs else: raise ValueError( - "An error was encountered quering the number of devices " + "An error was encountered querying the number of devices " "associated with this context" ) diff --git a/dpctl/_sycl_device.pyx b/dpctl/_sycl_device.pyx index 6814dec677..50ce94d94c 100644 --- a/dpctl/_sycl_device.pyx +++ b/dpctl/_sycl_device.pyx @@ -1163,7 +1163,7 @@ cdef class SyclDevice(_SyclDevice): def create_sub_devices(self, **kwargs): """create_sub_devices(partition=parition_spec) Creates a list of sub-devices by partitioning a root device based on the - provided partion specifier. + provided partition specifier. A partition specifier must be provided using a "partition" keyword argument. 
Possible values for the specifier are: an int, a diff --git a/dpctl/_sycl_event.pyx b/dpctl/_sycl_event.pyx index 90165b4547..34576a2ef7 100644 --- a/dpctl/_sycl_event.pyx +++ b/dpctl/_sycl_event.pyx @@ -119,7 +119,7 @@ cdef class SyclEvent(_SyclEvent): # Create a default SyclEvent e = dpctl.SyclEvent() - - Invoking the constuctor with a named ``PyCapsule`` with name + - Invoking the constructor with a named ``PyCapsule`` with name **"SyclEventRef"** that carries a pointer to a ``sycl::event`` object. The capsule will be renamed upon successful consumption to ensure one-time use. A new named capsule can be constructed by diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index dbf0ae8385..6acf3396e1 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -246,7 +246,7 @@ cdef class SyclQueue(_SyclQueue): # create a queue for each sub-device using the common context queues = [dpctl.SyclQueue(ctx, sub_d) for sub_d in sub_devices] - - Invoking the constuctor with a named ``PyCapsule`` with the name + - Invoking the constructor with a named ``PyCapsule`` with the name **"SyclQueueRef"** that carries a pointer to a ``sycl::queue`` object. The capsule will be renamed upon successful consumption to ensure one-time use. A new named capsule can be constructed by diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 0137549251..322272df2d 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -37,14 +37,14 @@ class SyclTimer: q = dpctl.SyclQueue(property='enable_profiling') # create the timer - miliseconds_sc = 1e-3 - timer = dpctl.SyclTimer(time_scale = miliseconds_sc) + milliseconds_sc = 1e-3 + timer = dpctl.SyclTimer(time_scale = milliseconds_sc) # use the timer with timer(queue=q): code_block - # retrieve elapsed times in miliseconds + # retrieve elapsed times in milliseconds sycl_dt, wall_dt = timer.dt Remark: diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 1434da1f32..b529c41599 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -742,12 +742,12 @@ class usm_memory : public py::object return nullptr; } - auto convertor = + auto converter = ::dpctl::detail::dpctl_capi::get().as_usm_memory_pyobj(); py::object res; try { - res = convertor(py::handle(o)); + res = converter(py::handle(o)); } catch (const py::error_already_set &e) { return nullptr; } diff --git a/dpctl/memory/_memory.pyx b/dpctl/memory/_memory.pyx index 4ea19bfba0..e0900f870e 100644 --- a/dpctl/memory/_memory.pyx +++ b/dpctl/memory/_memory.pyx @@ -92,7 +92,7 @@ cdef void copy_via_host(void *dest_ptr, SyclQueue dest_queue, void *src_ptr, SyclQueue src_queue, size_t nbytes): """ Copies `nbytes` bytes from `src_ptr` USM memory to - `dest_ptr` USM memory using host as the intemediary. + `dest_ptr` USM memory using host as the intermediary. This is useful when `src_ptr` and `dest_ptr` are bound to incompatible SYCL contexts. 
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index ca83b8350b..49f25aef6a 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -37,6 +37,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 7438fb8a67..ad51689f3f 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -60,6 +60,7 @@ from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack from dpctl.tensor._indexing_functions import extract, nonzero, place, put, take +from dpctl.tensor._linear_algebra_functions import matrix_transpose from dpctl.tensor._manipulation_functions import ( broadcast_arrays, broadcast_to, @@ -199,6 +200,7 @@ "tril", "triu", "where", + "matrix_transpose", "all", "any", "dtype", diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 3eae29f057..bc1b071460 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -56,7 +56,7 @@ def _copy_to_numpy(ary): def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None): "Copies numpy array `np_ary` into a new usm_ndarray" - # This may peform a copy to meet stated requirements + # This may perform a copy to meet stated requirements Xnp = np.require(np_ary, requirements=["A", "E"]) alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None) dt = Xnp.dtype @@ -517,6 +517,11 @@ def copy(usm_ary, order="K"): - "K": match the layout of `usm_ary` as closely as possible. """ + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() if not isinstance(usm_ary, dpt.usm_ndarray): return TypeError( f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" @@ -585,11 +590,11 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): return TypeError( f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" ) - if not isinstance(order, str) or order not in ["A", "C", "F", "K"]: + if len(order) == 0 or order[0] not in "KkAaCcFf": raise ValueError( - "Unrecognized value of the order keyword. " - "Recognized values are 'A', 'C', 'F', or 'K'" + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." ) + order = order[0].upper() ary_dtype = usm_ary.dtype target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): diff --git a/dpctl/tensor/_device.py b/dpctl/tensor/_device.py index 63e9cee80f..30afacb435 100644 --- a/dpctl/tensor/_device.py +++ b/dpctl/tensor/_device.py @@ -96,7 +96,7 @@ def sycl_context(self): @property def sycl_device(self): """ - :class:`dpctl.SyclDevice` targed by this :class:`.Device`. + :class:`dpctl.SyclDevice` targeted by this :class:`.Device`. 
""" return self.sycl_queue_.sycl_device diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index 002b0ef5ec..7d46a3d814 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -29,6 +29,7 @@ from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK from ._type_utils import ( _acceptance_fn_default, + _all_data_types, _find_buf_dtype, _find_buf_dtype2, _to_device_supported_dtype, @@ -44,6 +45,7 @@ def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs): self.__name__ = "UnaryElementwiseFunc" self.name_ = name self.result_type_resolver_fn_ = result_type_resolver_fn + self.types_ = None self.unary_fn_ = unary_dp_impl_fn self.__doc__ = docs @@ -53,6 +55,18 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + @property + def types(self): + types = self.types_ + if not types: + types = [] + for dt1 in _all_data_types(True, True): + dt2 = self.result_type_resolver_fn_(dt1) + if dt2: + types.append(f"{dt1.char}->{dt2.char}") + self.types_ = types + return types + def __call__(self, x, out=None, order="K"): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") @@ -363,6 +377,7 @@ def __init__( self.__name__ = "BinaryElementwiseFunc" self.name_ = name self.result_type_resolver_fn_ = result_type_resolver_fn + self.types_ = None self.binary_fn_ = binary_dp_impl_fn self.binary_inplace_fn_ = binary_inplace_fn self.__doc__ = docs @@ -377,6 +392,20 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + @property + def types(self): + types = self.types_ + if not types: + types = [] + _all_dtypes = _all_data_types(True, True) + for dt1 in _all_dtypes: + for dt2 in _all_dtypes: + dt3 = self.result_type_resolver_fn_(dt1, dt2) + if dt3: + types.append(f"{dt1.char}{dt2.char}->{dt3.char}") + self.types_ = types + return types + def __call__(self, o1, o2, out=None, order="K"): if order not in ["K", "C", "F", "A"]: order = "K" @@ -439,7 +468,7 @@ def __call__(self, o1, o2, out=None, order="K"): o1_dtype = _get_dtype(o1, sycl_dev) o2_dtype = _get_dtype(o2, sycl_dev) if not all(_validate_dtype(o) for o in (o1_dtype, o2_dtype)): - raise ValueError("Operands of unsupported types") + raise ValueError("Operands have unsupported data types") o1_dtype, o2_dtype = _resolve_weak_types(o1_dtype, o2_dtype, sycl_dev) @@ -469,7 +498,7 @@ def __call__(self, o1, o2, out=None, order="K"): if out.shape != res_shape: raise ValueError( "The shape of input and output arrays are inconsistent. " - f"Expected output shape is {o1_shape}, got {out.shape}" + f"Expected output shape is {res_shape}, got {out.shape}" ) if res_dt != out.dtype: diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index fe85a183ba..8e2abee837 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -1628,7 +1628,7 @@ _subtract_docstring_ = """ subtract(x1, x2, out=None, order='K') -Calculates the difference bewteen each element `x1_i` of the input +Calculates the difference between each element `x1_i` of the input array `x1` and the respective element `x2_i` of the input array `x2`. 
Args: diff --git a/dpctl/tensor/_linear_algebra_functions.py b/dpctl/tensor/_linear_algebra_functions.py new file mode 100644 index 0000000000..fd2c58b08a --- /dev/null +++ b/dpctl/tensor/_linear_algebra_functions.py @@ -0,0 +1,48 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dpctl.tensor as dpt + + +def matrix_transpose(x): + """matrix_transpose(x) + + Transposes the innermost two dimensions of `x`, where `x` is a + 2-dimensional matrix or a stack of 2-dimensional matrices. + + To convert from a 1-dimensional array to a 2-dimensional column + vector, use x[:, dpt.newaxis]. + + Args: + x (usm_ndarray): + Input array with shape (..., m, n). + + Returns: + usm_ndarray: + Array with shape (..., n, m). + """ + + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + if x.ndim < 2: + raise ValueError( + "dpctl.tensor.matrix_transpose requires array to have" + "at least 2 dimensions" + ) + + return x.mT diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 9406e386af..cb54556ed2 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -15,7 +15,8 @@ # limitations under the License. -from itertools import chain, product, repeat +import operator +from itertools import chain, repeat import numpy as np from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple @@ -315,7 +316,7 @@ def broadcast_to(X, shape): raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") # Use numpy.broadcast_to to check the validity of the input - # parametr 'shape'. Raise ValueError if 'X' is not compatible + # parameter 'shape'. Raise ValueError if 'X' is not compatible # with 'shape' according to NumPy's broadcasting rules. 
new_array = np.broadcast_to( np.broadcast_to(np.empty(tuple(), dtype="u1"), X.shape), shape @@ -426,10 +427,11 @@ def roll(X, shift, axis=None): if not isinstance(X, dpt.usm_ndarray): raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") if axis is None: + shift = operator.index(shift) res = dpt.empty( X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue ) - hev, _ = ti._copy_usm_ndarray_for_reshape( + hev, _ = ti._copy_usm_ndarray_for_roll_1d( src=X, dst=res, shift=shift, sycl_queue=X.sycl_queue ) hev.wait() @@ -438,31 +440,20 @@ def roll(X, shift, axis=None): broadcasted = np.broadcast(shift, axis) if broadcasted.ndim > 1: raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") - shifts = {ax: 0 for ax in range(X.ndim)} + shifts = [ + 0, + ] * X.ndim for sh, ax in broadcasted: shifts[ax] += sh - rolls = [((np.s_[:], np.s_[:]),)] * X.ndim - for ax, offset in shifts.items(): - offset %= X.shape[ax] or 1 - if offset: - # (original, result), (original, result) - rolls[ax] = ( - (np.s_[:-offset], np.s_[offset:]), - (np.s_[-offset:], np.s_[:offset]), - ) + exec_q = X.sycl_queue res = dpt.empty( - X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue + X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=exec_q ) - hev_list = [] - for indices in product(*rolls): - arr_index, res_index = zip(*indices) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=X[arr_index], dst=res[res_index], sycl_queue=X.sycl_queue - ) - hev_list.append(hev) - - dpctl.SyclEvent.wait_for(hev_list) + ht_e, _ = ti._copy_usm_ndarray_for_roll_nd( + src=X, dst=res, shifts=shifts, sycl_queue=exec_q + ) + ht_e.wait() return res @@ -550,7 +541,6 @@ def _concat_axis_None(arrays): hev, _ = ti._copy_usm_ndarray_for_reshape( src=src_, dst=res[fill_start:fill_end], - shift=0, sycl_queue=exec_q, ) fill_start = fill_end @@ -757,7 +747,7 @@ def iinfo(dtype): Returns: iinfo_object: - An object with the followign attributes + An object with the following attributes * bits: int number of bits occupied by the data type * max: int diff --git a/dpctl/tensor/_print.py b/dpctl/tensor/_print.py index 30bb353df7..97e485a7e7 100644 --- a/dpctl/tensor/_print.py +++ b/dpctl/tensor/_print.py @@ -148,7 +148,7 @@ def set_print_options( suppress (bool, optional): If `True,` numbers equal to zero in the current precision will print as zero. Default: `False`. - nanstr (str, optional): String used to repesent nan. + nanstr (str, optional): String used to represent nan. Raises `TypeError` if nanstr is not a string. Default: `"nan"`. infstr (str, optional): String used to represent infinity. 
diff --git a/dpctl/tensor/_reshape.py b/dpctl/tensor/_reshape.py index ac4a04cac4..b363c063de 100644 --- a/dpctl/tensor/_reshape.py +++ b/dpctl/tensor/_reshape.py @@ -165,7 +165,7 @@ def reshape(X, shape, order="C", copy=None): ) if order == "C": hev, _ = _copy_usm_ndarray_for_reshape( - src=X, dst=flat_res, shift=0, sycl_queue=X.sycl_queue + src=X, dst=flat_res, sycl_queue=X.sycl_queue ) hev.wait() else: diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 4f45d57391..4289284251 100644 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -104,7 +104,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): Give basic slicing index `ind` and array layout information produce a 5-tuple (resulting_shape, resulting_strides, resulting_offset, advanced_ind, resulting_advanced_ind_pos) - used to contruct a view into underlying array over which advanced + used to construct a view into underlying array over which advanced indexing, if any, is to be performed. Raises IndexError for invalid index `ind`. @@ -201,7 +201,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): raise TypeError if ellipses_count > 1: raise IndexError( - "an index can only have a sinlge ellipsis ('...')") + "an index can only have a single ellipsis ('...')") if axes_referenced > len(shape): raise IndexError( "too many indices for an array, array is " diff --git a/dpctl/tensor/_stride_utils.pxi b/dpctl/tensor/_stride_utils.pxi index bc7349a5e6..4f12b989dc 100644 --- a/dpctl/tensor/_stride_utils.pxi +++ b/dpctl/tensor/_stride_utils.pxi @@ -55,7 +55,7 @@ cdef int _from_input_shape_strides( nelems - Number of elements in array min_disp = min( dot(strides, index), index for shape) max_disp = max( dor(strides, index), index for shape) - contig = enumation for array contiguity + contig = enumeration for array contiguity Returns: 0 on success, error code otherwise. On success pointers point to allocated arrays, Otherwise they are set to NULL diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 83e5e84759..ba18600135 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -52,6 +52,15 @@ cdef class InternalUSMArrayError(Exception): pass +cdef object _as_zero_dim_ndarray(object usm_ary): + "Convert size-1 array to NumPy 0d array" + mem_view = dpmem.as_usm_memory(usm_ary) + host_buf = mem_view.copy_to_host() + view = host_buf.view(usm_ary.dtype) + view.shape = tuple() + return view + + cdef class usm_ndarray: """ usm_ndarray(shape, dtype=None, strides=None, buffer="device", \ offset=0, order="C", buffer_ctor_kwargs=dict(), \ @@ -416,7 +425,7 @@ cdef class usm_ndarray: cdef char *ary_ptr = NULL if (not isinstance(self.base_, dpmem._memory._Memory)): raise InternalUSMArrayError( - "Invalid instance of usm_ndarray ecountered. " + "Invalid instance of usm_ndarray encountered. " "Private field base_ has an unexpected type {}.".format( type(self.base_) ) @@ -557,7 +566,7 @@ cdef class usm_ndarray: elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS): return _f_contig_strides(self.nd_, self.shape_) else: - raise ValueError("Inconsitent usm_ndarray data") + raise ValueError("Inconsistent usm_ndarray data") @property def flags(self): @@ -644,7 +653,7 @@ cdef class usm_ndarray: @property def T(self): - """ Returns tranposed array for 2D array, raises `ValueError` + """ Returns transposed array for 2D array, raises `ValueError` otherwise. 
""" if self.nd_ == 2: @@ -662,7 +671,7 @@ cdef class usm_ndarray: """ if self.nd_ < 2: raise ValueError( - "array.mT requires array to have at least 2-dimensons." + "array.mT requires array to have at least 2 dimensions." ) return _m_transpose(self) @@ -840,9 +849,7 @@ cdef class usm_ndarray: def __bool__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__bool__() if self.size == 0: @@ -857,9 +864,7 @@ cdef class usm_ndarray: def __float__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__float__() raise ValueError( @@ -868,9 +873,7 @@ cdef class usm_ndarray: def __complex__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__complex__() raise ValueError( @@ -879,9 +882,7 @@ cdef class usm_ndarray: def __int__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__int__() raise ValueError( @@ -1216,14 +1217,14 @@ cdef usm_ndarray _real_view(usm_ndarray ary): offset_elems = ary.get_offset() * 2 r = usm_ndarray.__new__( usm_ndarray, - _make_int_tuple(ary.nd_, ary.shape_), + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), dtype=_make_typestr(r_typenum_), strides=tuple(2 * si for si in ary.strides), buffer=ary.base_, offset=offset_elems, order=('C' if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else 'F') ) - r.flags_ = ary.flags_ + r.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) r.array_namespace_ = ary.array_namespace_ return r @@ -1248,14 +1249,14 @@ cdef usm_ndarray _imag_view(usm_ndarray ary): offset_elems = 2 * ary.get_offset() + 1 r = usm_ndarray.__new__( usm_ndarray, - _make_int_tuple(ary.nd_, ary.shape_), + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), dtype=_make_typestr(r_typenum_), strides=tuple(2 * si for si in ary.strides), buffer=ary.base_, offset=offset_elems, order=('C' if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else 'F') ) - r.flags_ = ary.flags_ + r.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) r.array_namespace_ = ary.array_namespace_ return r @@ -1375,8 +1376,8 @@ cdef api object UsmNDArray_MakeSimpleFromMemory( QRef: DPCTLSyclQueueRef associated with the allocation offset: distance between element with zero multi-index and the start of allocation - oder: Memory layout of the array. Use 'C' for C-contiguous or - row-major layout; 'F' for F-contiguous or column-major layout + order: Memory layout of the array. 
Use 'C' for C-contiguous or + row-major layout; 'F' for F-contiguous or column-major layout Returns: Created usm_ndarray instance """ diff --git a/dpctl/tensor/include/dlpack/README.md b/dpctl/tensor/include/dlpack/README.md index 02c594c0fe..2c22e9aa8d 100644 --- a/dpctl/tensor/include/dlpack/README.md +++ b/dpctl/tensor/include/dlpack/README.md @@ -4,4 +4,4 @@ The header `dlpack.h` downloaded from `https://github.com/dmlc/dlpack.git` remot The file can also be viewed using github web interface at https://github.com/dmlc/dlpack/blob/e2bdd3bee8cb6501558042633fa59144cc8b7f5f/include/dlpack/dlpack.h -License file was retrived from https://github.com/dmlc/dlpack/blob/main/LICENSE +License file was retrieved from https://github.com/dmlc/dlpack/blob/main/LICENSE diff --git a/dpctl/tensor/include/dlpack/dlpack.h b/dpctl/tensor/include/dlpack/dlpack.h index 6d51801123..672448d1c6 100644 --- a/dpctl/tensor/include/dlpack/dlpack.h +++ b/dpctl/tensor/include/dlpack/dlpack.h @@ -168,7 +168,7 @@ typedef struct { * `byte_offset` field should be used to point to the beginning of the data. * * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, - * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * TVM, perhaps others) do not adhere to this 256 byte alignment requirement * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed * (after which this note will be updated); at the moment it is recommended * to not rely on the data pointer being correctly aligned. diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index bb2ddc5ad6..43c546860b 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -424,7 +424,6 @@ typedef size_t (*mask_positions_strided_impl_fn_ptr_t)( size_t, const char *, int, - py::ssize_t, const py::ssize_t *, char *, std::vector const &); @@ -434,7 +433,6 @@ size_t mask_positions_strided_impl(sycl::queue q, size_t n_elems, const char *mask, int nd, - py::ssize_t input_offset, const py::ssize_t *shape_strides, char *cumsum, std::vector const &depends = {}) @@ -444,7 +442,7 @@ size_t mask_positions_strided_impl(sycl::queue q, cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); size_t wg_size = 128; - StridedIndexer strided_indexer{nd, input_offset, shape_strides}; + StridedIndexer strided_indexer{nd, 0, shape_strides}; NonZeroIndicator non_zero_indicator{}; sycl::event comp_ev = diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index f1e63ccc60..e5aaa34903 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -56,9 +56,6 @@ class copy_cast_contig_kernel; template class copy_cast_from_host_kernel; -template -class copy_for_reshape_generic_kernel; - template class Caster { public: @@ -118,7 +115,7 @@ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` to `dstTy`. - Both arrays have array dimensionality specied via argument `nd`. The + Both arrays have array dimensionality specified via argument `nd`. 
The `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the first `nd` elements encode common shape, second `nd` elements contain strides of `src` array, and the trailing `nd` elements contain strides of `dst` array. @@ -244,25 +241,26 @@ class ContigCopyFunctor if (base + n_vecs * vec_sz * sgSize < nelems && sgSize == max_sgSize) { - using src_ptrT = - sycl::multi_ptr; - using dst_ptrT = - sycl::multi_ptr; sycl::vec src_vec; sycl::vec dst_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - src_vec = - sg.load(src_ptrT(&src_p[base + it * sgSize])); + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &src_p[base + it * sgSize]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &dst_p[base + it * sgSize]); + + src_vec = sg.load(src_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; k++) { dst_vec[k] = fn(src_vec[k]); } - sg.store(dst_ptrT(&dst_p[base + it * sgSize]), - dst_vec); + sg.store(dst_multi_ptr, dst_vec); } } else { @@ -629,27 +627,24 @@ struct CopyAndCastFromHostFactory // =============== Copying for reshape ================== // +template +class copy_for_reshape_generic_kernel; + template class GenericCopyForReshapeFunctor { private: - py::ssize_t offset = 0; - py::ssize_t size = 1; - // USM array of size 2*(src_nd + dst_nd) - // [ src_shape; src_strides; dst_shape; dst_strides ] - Ty *src_p = nullptr; + const Ty *src_p = nullptr; Ty *dst_p = nullptr; SrcIndexerT src_indexer_; DstIndexerT dst_indexer_; public: - GenericCopyForReshapeFunctor(py::ssize_t shift, - py::ssize_t nelems, - char *src_ptr, + GenericCopyForReshapeFunctor(const char *src_ptr, char *dst_ptr, SrcIndexerT src_indexer, DstIndexerT dst_indexer) - : offset(shift), size(nelems), src_p(reinterpret_cast(src_ptr)), + : src_p(reinterpret_cast(src_ptr)), dst_p(reinterpret_cast(dst_ptr)), src_indexer_(src_indexer), dst_indexer_(dst_indexer) { @@ -657,45 +652,36 @@ class GenericCopyForReshapeFunctor void operator()(sycl::id<1> wiid) const { - py::ssize_t this_src_offset = src_indexer_(wiid.get(0)); - const Ty *in = src_p + this_src_offset; - - py::ssize_t shifted_wiid = - (static_cast(wiid.get(0)) + offset) % size; - shifted_wiid = (shifted_wiid >= 0) ? shifted_wiid : shifted_wiid + size; + const py::ssize_t src_offset = src_indexer_(wiid.get(0)); + const py::ssize_t dst_offset = dst_indexer_(wiid.get(0)); - py::ssize_t this_dst_offset = dst_indexer_(shifted_wiid); - - Ty *out = dst_p + this_dst_offset; - *out = *in; + dst_p[dst_offset] = src_p[src_offset]; } }; // define function type typedef sycl::event (*copy_for_reshape_fn_ptr_t)( sycl::queue, - py::ssize_t, // shift - size_t, // num_elements - int, - int, // src_nd, dst_nd + size_t, // num_elements + int, // src_nd + int, // dst_nd py::ssize_t *, // packed shapes and strides - char *, // src_data_ptr + const char *, // src_data_ptr char *, // dst_data_ptr const std::vector &); /*! * @brief Function to copy content of array while reshaping. * - * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * Submits a kernel to perform a copy `dst[unravel_index(i, * dst.shape)] = src[unravel_undex(i, src.shape)]`. * * @param q The execution queue where kernel is submitted. - * @param shift The shift in flat indexing. 
* @param nelems The number of elements to copy * @param src_nd Array dimension of the source array * @param dst_nd Array dimension of the destination array * @param packed_shapes_and_strides Kernel accessible USM array of size - * `2*src_nd + 2*dst_nd` with contant `[src_shape, src_strides, dst_shape, + * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape, * dst_strides]`. * @param src_p Typeless USM pointer to the buffer of the source array * @param dst_p Typeless USM pointer to the buffer of the destination array @@ -708,31 +694,40 @@ typedef sycl::event (*copy_for_reshape_fn_ptr_t)( template sycl::event copy_for_reshape_generic_impl(sycl::queue q, - py::ssize_t shift, size_t nelems, int src_nd, int dst_nd, py::ssize_t *packed_shapes_and_strides, - char *src_p, + const char *src_p, char *dst_p, const std::vector &depends) { dpctl::tensor::type_utils::validate_type_for_device(q); sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { - StridedIndexer src_indexer{ - src_nd, 0, - const_cast(packed_shapes_and_strides)}; - StridedIndexer dst_indexer{ - dst_nd, 0, - const_cast(packed_shapes_and_strides + - (2 * src_nd))}; cgh.depends_on(depends); - cgh.parallel_for>( + + // packed_shapes_and_strides: + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + + const py::ssize_t *src_shape_and_strides = + const_cast(packed_shapes_and_strides); + + const py::ssize_t *dst_shape_and_strides = + const_cast(packed_shapes_and_strides + + (2 * src_nd)); + + StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides}; + StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides}; + + using KernelName = + copy_for_reshape_generic_kernel; + + cgh.parallel_for( sycl::range<1>(nelems), GenericCopyForReshapeFunctor( - shift, nelems, src_p, dst_p, src_indexer, dst_indexer)); + src_p, dst_p, src_indexer, dst_indexer)); }); return copy_for_reshape_ev; @@ -752,6 +747,387 @@ template struct CopyForReshapeGenericFactory } }; +// ================== Copying for roll ================== // + +/*! @brief Functor to cyclically roll global_id to the left */ +struct LeftRolled1DTransformer +{ + LeftRolled1DTransformer(size_t offset, size_t size) + : offset_(offset), size_(size) + { + } + + size_t operator()(size_t gid) const + { + const size_t shifted_gid = + ((gid < offset_) ? gid + size_ - offset_ : gid - offset_); + return shifted_gid; + } + +private: + size_t offset_ = 0; + size_t size_ = 1; +}; + +/*! @brief Indexer functor to compose indexer and transformer */ +template struct CompositionIndexer +{ + CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {} + + auto operator()(size_t gid) const + { + return f_(t_(gid)); + } + +private: + IndexerT f_; + TransformerT t_; +}; + +/*! 
@brief Indexer functor to find offset for nd-shifted indices lifted from + * iteration id */ +struct RolledNDIndexer +{ + RolledNDIndexer(int nd, + const py::ssize_t *shape, + const py::ssize_t *strides, + const py::ssize_t *ndshifts, + py::ssize_t starting_offset) + : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts), + starting_offset_(starting_offset) + { + } + + py::ssize_t operator()(size_t gid) const + { + return compute_offset(gid); + } + +private: + int nd_ = -1; + const py::ssize_t *shape_ = nullptr; + const py::ssize_t *strides_ = nullptr; + const py::ssize_t *ndshifts_ = nullptr; + py::ssize_t starting_offset_ = 0; + + py::ssize_t compute_offset(py::ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd_); + py::ssize_t relative_offset_(0); + _ind.get_left_rolled_displacement( + gid, + shape_, // shape ptr + strides_, // strides ptr + ndshifts_, // shifts ptr + relative_offset_); + return starting_offset_ + relative_offset_; + } +}; + +template +class copy_for_roll_strided_kernel; + +template +class StridedCopyForRollFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + StridedCopyForRollFunctor(const Ty *src_ptr, + Ty *dst_ptr, + SrcIndexerT src_indexer, + DstIndexerT dst_indexer) + : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const size_t gid = wiid.get(0); + + const py::ssize_t src_offset = src_indexer_(gid); + const py::ssize_t dst_offset = dst_indexer_(gid); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( + sycl::queue, + size_t, // shift + size_t, // num_elements + int, // common_nd + const py::ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + py::ssize_t, // src_offset + char *, // dst_data_ptr + py::ssize_t, // dst_offset + const std::vector &); + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param nd Array dimensionality of the destination and source arrays + * @param packed_shapes_and_strides Kernel accessible USM array + * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of first element of src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of first element of dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @ingroup CopyAndCastKernels + */ +template +sycl::event +copy_for_roll_strided_impl(sycl::queue q, + size_t shift, + size_t nelems, + int nd, + const py::ssize_t *packed_shapes_and_strides, + const char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 3 * nd + // [ common_shape; src_strides; dst_strides ] + + StridedIndexer src_indexer{nd, src_offset, packed_shapes_and_strides}; + LeftRolled1DTransformer left_roll_transformer{shift, nelems}; + + using CompositeIndexerT = + CompositionIndexer; + + CompositeIndexerT rolled_src_indexer(src_indexer, + left_roll_transformer); + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, + packed_shapes_and_strides, + packed_shapes_and_strides + 2 * nd}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, rolled_src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +// define function type +typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( + sycl::queue, + size_t, // shift + size_t, // num_elements + const char *, // src_data_ptr + py::ssize_t, // src_offset + char *, // dst_data_ptr + py::ssize_t, // dst_offset + const std::vector &); + +template class copy_for_roll_contig_kernel; + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of the start of array src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of the start of array dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_contig_impl(sycl::queue q, + size_t shift, + size_t nelems, + const char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + NoOpIndexer src_indexer{}; + LeftRolled1DTransformer roller{shift, nelems}; + + CompositionIndexer + left_rolled_src_indexer{src_indexer, roller}; + NoOpIndexer dst_indexer{}; + + using KernelName = copy_for_roll_contig_kernel; + + const Ty *src_tp = reinterpret_cast(src_p) + src_offset; + Ty *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor< + Ty, CompositionIndexer, + NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer, + dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! 
+ * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue, + size_t, // num_elements + int, // common_nd + const py::ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + py::ssize_t, // src_offset + char *, // dst_data_ptr + py::ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue q, + size_t nelems, + int nd, + const py::ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const py::ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const py::ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const py::ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const py::ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, shifts_ptr, + src_offset}; + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. 
+ * @ingroup CopyAndCastKernels + */ +template struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + } // namespace copy_and_cast } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 855d5479c1..797a7f2534 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -65,9 +65,6 @@ struct UnaryContigFunctor if constexpr (UnaryOperatorT::is_constant::value) { // value of operator is known to be a known constant constexpr resT const_val = UnaryOperatorT::constant_value; - using out_ptrT = - sycl::multi_ptr; auto sg = ndit.get_sub_group(); std::uint8_t sgSize = sg.get_local_range()[0]; @@ -80,8 +77,11 @@ struct UnaryContigFunctor sycl::vec res_vec(const_val); #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + sg.store(out_multi_ptr, res_vec); } } else { @@ -94,13 +94,6 @@ struct UnaryContigFunctor else if constexpr (UnaryOperatorT::supports_sg_loadstore::value && UnaryOperatorT::supports_vec::value) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; - auto sg = ndit.get_sub_group(); std::uint16_t sgSize = sg.get_local_range()[0]; std::uint16_t max_sgSize = sg.get_max_local_range()[0]; @@ -113,10 +106,16 @@ struct UnaryContigFunctor #pragma unroll for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - x = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + x = sg.load(in_multi_ptr); sycl::vec res_vec = op(x); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -141,23 +140,23 @@ struct UnaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (maxsgSize == sgSize)) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg_vec = sg.load(in_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { arg_vec[k] = op(arg_vec[k]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - arg_vec); + sg.store(out_multi_ptr, arg_vec); } } else { @@ -179,24 +178,24 @@ struct UnaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (maxsgSize == sgSize)) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - 
arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg_vec = sg.load(in_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { res_vec[k] = op(arg_vec[k]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -365,28 +364,26 @@ struct BinaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg1_vec = - sg.load(in_ptrT1(&in1[base + it * sgSize])); - arg2_vec = - sg.load(in_ptrT2(&in2[base + it * sgSize])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[base + it * sgSize]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg1_vec = sg.load(in1_multi_ptr); + arg2_vec = sg.load(in2_multi_ptr); res_vec = op(arg1_vec, arg2_vec); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -407,32 +404,30 @@ struct BinaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg1_vec = - sg.load(in_ptrT1(&in1[base + it * sgSize])); - arg2_vec = - sg.load(in_ptrT2(&in2[base + it * sgSize])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[base + it * sgSize]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg1_vec = sg.load(in1_multi_ptr); + arg2_vec = sg.load(in2_multi_ptr); #pragma unroll for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { res_vec[vec_id] = op(arg1_vec[vec_id], arg2_vec[vec_id]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -530,22 +525,24 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; - - const argT1 mat_el = sg.load(in_ptrT1(&mat[base])); - const argT2 vec_el = sg.load(in_ptrT2(&padded_vec[base % n1])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + 
sycl::access::decorated::yes>(&mat[base]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT1 mat_el = sg.load(in1_multi_ptr); + const argT2 vec_el = sg.load(in2_multi_ptr); resT res_el = op(mat_el, vec_el); - sg.store(res_ptrT(&res[base]), res_el); + sg.store(out_multi_ptr, res_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; @@ -592,22 +589,24 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; - - const argT2 mat_el = sg.load(in_ptrT2(&mat[base])); - const argT1 vec_el = sg.load(in_ptrT1(&padded_vec[base % n1])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT2 mat_el = sg.load(in2_multi_ptr); + const argT1 vec_el = sg.load(in1_multi_ptr); resT res_el = op(vec_el, mat_el); - sg.store(res_ptrT(&res[base]), res_el); + sg.store(out_multi_ptr, res_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index a41029b27c..505a40acc5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -76,24 +76,24 @@ struct BinaryInplaceContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using rhs_ptrT = - sycl::multi_ptr; - using lhs_ptrT = - sycl::multi_ptr; + sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = - sg.load(rhs_ptrT(&rhs[base + it * sgSize])); - res_vec = - sg.load(lhs_ptrT(&lhs[base + it * sgSize])); + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[base + it * sgSize]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[base + it * sgSize]); + + arg_vec = sg.load(rhs_multi_ptr); + res_vec = sg.load(lhs_multi_ptr); op(res_vec, arg_vec); - sg.store(lhs_ptrT(&lhs[base + it * sgSize]), - res_vec); + + sg.store(lhs_multi_ptr, res_vec); } } else { @@ -115,27 +115,25 @@ struct BinaryInplaceContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using rhs_ptrT = - sycl::multi_ptr; - using lhs_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = - sg.load(rhs_ptrT(&rhs[base + it * sgSize])); - res_vec = - sg.load(lhs_ptT(&lhs[base + it * sgSize])); + auto rhs_multi_ptr = sycl::address_space_cast< + 
sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[base + it * sgSize]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[base + it * sgSize]); + + arg_vec = sg.load(rhs_multi_ptr); + res_vec = sg.load(lhs_multi_ptr); #pragma unroll for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { op(res_vec[vec_id], arg_vec[vec_id]); } - sg.store(lhs_ptrT(&lhs[base + it * sgSize]), - res_vec); + sg.store(lhs_multi_ptr, res_vec); } } else { @@ -223,19 +221,20 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); - const argT vec_el = sg.load(in_ptrT(&padded_vec[base % n1])); - resT mat_el = sg.load(res_ptrT(&mat[base])); + const argT vec_el = sg.load(in_multi_ptr); + resT mat_el = sg.load(out_multi_ptr); op(mat_el, vec_el); - sg.store(res_ptrT(&mat[base]), mat_el); + sg.store(out_multi_ptr, mat_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index b996a6d0ec..e1c23113c6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -115,7 +115,10 @@ template struct Expm1Functor // x, y finite numbers realT cosY_val; - const realT sinY_val = sycl::sincos(y, &cosY_val); + auto cosY_val_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::private_space, + sycl::access::decorated::yes>(&cosY_val); + const realT sinY_val = sycl::sincos(y, cosY_val_multi_ptr); const realT sinhalfY_val = std::sin(y / 2); const realT res_re = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index 268c679f00..32e97df58d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -57,7 +57,12 @@ struct FloorDivideFunctor resT operator()(const argT1 &in1, const argT2 &in2) { - if constexpr (std::is_integral_v || std::is_integral_v) { + if constexpr (std::is_same_v && + std::is_same_v) { + return (in2) ? static_cast(in1) : resT(0); + } + else if constexpr (std::is_integral_v || + std::is_integral_v) { if (in2 == argT2(0)) { return resT(0); } @@ -81,7 +86,16 @@ struct FloorDivideFunctor sycl::vec operator()(const sycl::vec &in1, const sycl::vec &in2) { - if constexpr (std::is_integral_v) { + if constexpr (std::is_same_v && + std::is_same_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + res[i] = (in2[i]) ? 
static_cast(in1[i]) : resT(0); + } + return res; + } + else if constexpr (std::is_integral_v) { sycl::vec res; #pragma unroll for (int i = 0; i < vec_sz; ++i) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 808e82539e..9b9fa95061 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -150,11 +150,11 @@ template struct SqrtFunctor int get_normal_scale_double(const double &v) const { - constexpr int float_significant_bits = 53; + constexpr int double_significant_bits = 52; constexpr std::uint64_t exponent_mask = 0x7ff; constexpr int exponent_bias = 1023; const int scale = static_cast( - (sycl::bit_cast(v) >> float_significant_bits) & + (sycl::bit_cast(v) >> double_significant_bits) & exponent_mask); return scale - exponent_bias; } diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index c8aae0a3b9..fd3fcc9681 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -146,9 +146,9 @@ struct ReductionOverGroupWithAtomicFunctor void operator()(sycl::nd_item<1> it) const { - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t iter_gid = it.get_global_id(0) / red_gws_; - const size_t reduction_batch_id = get_reduction_batch_id(it); + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg @@ -204,14 +204,6 @@ struct ReductionOverGroupWithAtomicFunctor } } } - -private: - size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const - { - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; - return reduction_batch_id; - } }; typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( @@ -241,6 +233,12 @@ class sum_reduction_seq_strided_krn; template class sum_reduction_seq_contig_krn; +template +class sum_reduction_axis0_over_group_with_atomics_contig_krn; + +template +class sum_reduction_axis1_over_group_with_atomics_contig_krn; + using dpctl::tensor::sycl_utils::choose_workgroup_size; template @@ -344,20 +342,6 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); - if (reduction_groups > 1) { - const size_t &max_wg = - d.get_info(); - - if (reduction_nelems < preferrered_reductions_per_wi * max_wg) { - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); - reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - } - } - auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; @@ -395,7 +379,7 @@ typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( /* @brief Reduce rows in a matrix */ template -sycl::event sum_reduction_over_group_with_atomics_contig_impl( +sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( sycl::queue exec_q, size_t iter_nelems, // number of reductions (num. 
of rows in a matrix // when reducing over rows) @@ -417,7 +401,7 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<2>(reduction_nelems, sg_sizes); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -463,11 +447,11 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( RowsIndexerT, NoOpIndexerT>; using ReductionIndexerT = NoOpIndexerT; - RowsIndexerT columns_indexer{ + RowsIndexerT rows_indexer{ 0, static_cast(iter_nelems), static_cast(reduction_nelems)}; NoOpIndexerT result_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, result_indexer}; ReductionIndexerT reduction_indexer{}; @@ -481,27 +465,95 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); - if (reduction_groups > 1) { - const size_t &max_wg = - d.get_info(); - - if (reduction_nelems < preferrered_reductions_per_wi * max_wg) { - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); - reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - } - } + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = + class sum_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } +} + +/* @brief Reduce rows in a matrix */ +template +sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of cols in a matrix + // when reducing over cols) + size_t reduction_nelems, // size of each reduction (length of cols, i.e. 
+ // number of rows) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(res_init_ev); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + constexpr size_t preferrered_reductions_per_wi = 8; + size_t reductions_per_wi = + (reduction_nelems < preferrered_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferrered_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + using KernelName = + class sum_reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), @@ -558,14 +610,13 @@ struct ReductionOverGroupNoAtomicFunctor void operator()(sycl::nd_item<1> it) const { - - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t iter_gid = it.get_global_id(0) / red_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + // work-items sums over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid @@ -642,7 +693,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( size_t reductions_per_wi(preferrered_reductions_per_wi); if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + // reduction only requires 1 work-group, can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1079,7 +1130,25 @@ struct SumOverAxisTempsStridedFactory }; template -struct SumOverAxisAtomicContigFactory +struct 
SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + return dpctl::tensor::kernels:: + sum_reduction_axis1_over_group_with_atomics_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory { fnT get() const { @@ -1087,7 +1156,8 @@ struct SumOverAxisAtomicContigFactory srcTy, dstTy>::is_defined) { return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_contig_impl; + sum_reduction_axis0_over_group_with_atomics_contig_impl; } else { return nullptr; diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp index 67ce2ca1f0..9da5466dbe 100644 --- a/dpctl/tensor/libtensor/include/kernels/where.hpp +++ b/dpctl/tensor/libtensor/include/kernels/where.hpp @@ -100,15 +100,6 @@ class WhereContigFunctor if (base + n_vecs * vec_sz * sgSize < nelems && sgSize == max_sgSize) { - using dst_ptrT = - sycl::multi_ptr; - using x_ptrT = - sycl::multi_ptr; - using cond_ptrT = - sycl::multi_ptr; sycl::vec dst_vec; sycl::vec x1_vec; sycl::vec x2_vec; @@ -117,14 +108,27 @@ class WhereContigFunctor #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { auto idx = base + it * sgSize; - x1_vec = sg.load(x_ptrT(&x1_p[idx])); - x2_vec = sg.load(x_ptrT(&x2_p[idx])); - cond_vec = sg.load(cond_ptrT(&cond_p[idx])); + auto x1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x1_p[idx]); + auto x2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x2_p[idx]); + auto cond_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&cond_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + x1_vec = sg.load(x1_multi_ptr); + x2_vec = sg.load(x2_multi_ptr); + cond_vec = sg.load(cond_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { dst_vec[k] = cond_vec[k] ? 
x1_vec[k] : x2_vec[k]; } - sg.store(dst_ptrT(&dst_p[idx]), dst_vec); + sg.store(dst_multi_ptr, dst_vec); } } else { diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp index ac02d26bf0..19bcf9d0a8 100644 --- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp @@ -91,7 +91,7 @@ device_allocate_and_pack(sycl::queue q, { // memory transfer optimization, use USM-host for temporary speeds up - // tranfer to device, especially on dGPUs + // transfer to device, especially on dGPUs using usm_host_allocatorT = sycl::usm_allocator; using shT = std::vector; @@ -144,12 +144,12 @@ struct StridedIndexer { } - size_t operator()(py::ssize_t gid) const + py::ssize_t operator()(py::ssize_t gid) const { return compute_offset(gid); } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { return compute_offset(static_cast(gid)); } @@ -159,7 +159,7 @@ struct StridedIndexer py::ssize_t starting_offset; py::ssize_t const *shape_strides; - size_t compute_offset(py::ssize_t gid) const + py::ssize_t compute_offset(py::ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; @@ -185,12 +185,12 @@ struct UnpackedStridedIndexer { } - size_t operator()(py::ssize_t gid) const + py::ssize_t operator()(py::ssize_t gid) const { return compute_offset(gid); } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { return compute_offset(static_cast(gid)); } @@ -201,7 +201,7 @@ struct UnpackedStridedIndexer py::ssize_t const *shape; py::ssize_t const *strides; - size_t compute_offset(py::ssize_t gid) const + py::ssize_t compute_offset(py::ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; @@ -223,11 +223,10 @@ struct Strided1DIndexer { } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { // ensure 0 <= gid < size - return static_cast(offset + - std::min(gid, size - 1) * step); + return offset + std::min(gid, size - 1) * step; } private: @@ -245,9 +244,9 @@ struct Strided1DCyclicIndexer { } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { - return static_cast(offset + (gid % size) * step); + return offset + (gid % size) * step; } private: diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp index f654087281..7cca7c7b5d 100644 --- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp +++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp @@ -238,6 +238,30 @@ template class CIndexer_vector } return; } + + template + void get_left_rolled_displacement(indT i, + ShapeTy shape, + StridesTy stride, + StridesTy shifts, + indT &disp) const + { + indT i_ = i; + indT d = 0; + for (int dim = nd; --dim > 0;) { + const indT si = shape[dim]; + const indT q = i_ / si; + const indT r = (i_ - q * si); + // assumes si > shifts[dim] >= 0 + const indT shifted_r = + (r < shifts[dim] ? r + si - shifts[dim] : r - shifts[dim]); + d += shifted_r * stride[dim]; + i_ = q; + } + const indT shifted_r = + (i_ < shifts[0] ? 
i_ + shape[0] - shifts[0] : i_ - shifts[0]); + disp = d + shifted_r * stride[0]; + } }; /* @@ -909,6 +933,55 @@ contract_iter4(vecT shape, out_strides3, disp3, out_strides4, disp4); } +/* + For purposes of iterating over elements of an array with `shape` and + strides `strides` given as pointers `compact_iteration(nd, shape, strides)` + may modify memory and returns the new length of the array. + + The new shape and new strides `(new_shape, new_strides)` are such that + iterating over them will traverse the same elements in the same order, + possibly with reduced dimensionality. + */ +template +int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides) +{ + if (nd < 2) + return nd; + + bool contractable = true; + for (int i = 0; i < nd; ++i) { + if (strides[i] < 0) { + contractable = false; + } + } + + int nd_ = nd; + while (contractable) { + bool changed = false; + for (int i = 0; i + 1 < nd_; ++i) { + StridesTy str = strides[i + 1]; + StridesTy jump = strides[i] - (shape[i + 1] - 1) * str; + + if (jump == str) { + changed = true; + shape[i] *= shape[i + 1]; + for (int j = i; j < nd_; ++j) { + strides[j] = strides[j + 1]; + } + for (int j = i + 1; j + 1 < nd_; ++j) { + shape[j] = shape[j + 1]; + } + --nd_; + break; + } + } + if (!changed) + break; + } + + return nd_; +} + } // namespace strides } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 0dd63fe973..d37967daef 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -205,24 +205,14 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, auto const &strides_vector = mask.get_strides_vector(); using shT = std::vector; - shT simplified_shape; - shT simplified_strides; - py::ssize_t offset(0); + shT compact_shape; + shT compact_strides; int mask_nd = mask.get_ndim(); int nd = mask_nd; - dpctl::tensor::py_internal::simplify_iteration_space_1( - nd, shape, strides_vector, simplified_shape, simplified_strides, - offset); - - if (nd == 1 && simplified_strides[0] == 1) { - auto fn = (use_i32) - ? 
mask_positions_contig_i32_dispatch_vector[mask_typeid] - : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - - return fn(exec_q, mask_size, mask_data, cumsum_data, depends); - } + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); // Strided implementation auto strided_fn = @@ -232,7 +222,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( - exec_q, host_task_events, simplified_shape, simplified_strides); + exec_q, host_task_events, compact_shape, compact_strides); py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); if (shape_strides == nullptr) { sycl::event::wait(host_task_events); @@ -253,7 +243,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, dependent_events.insert(dependent_events.end(), depends.begin(), depends.end()); - size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset, + size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data, dependent_events); sycl::event::wait(host_task_events); diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp index 8edf982b16..7114d87c47 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp @@ -60,7 +60,6 @@ static copy_for_reshape_fn_ptr_t std::pair copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, sycl::queue exec_q, const std::vector &depends) { @@ -109,7 +108,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, if (src_nelems == 1) { // handle special case of 1-element array int src_elemsize = src.get_elemsize(); - char *src_data = src.get_data(); + const char *src_data = src.get_data(); char *dst_data = dst.get_data(); sycl::event copy_ev = exec_q.copy(src_data, dst_data, src_elemsize); @@ -146,7 +145,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, } sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); - char *src_data = src.get_data(); + const char *src_data = src.get_data(); char *dst_data = dst.get_data(); std::vector all_deps(depends.size() + 1); @@ -154,7 +153,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); sycl::event copy_for_reshape_event = - fn(exec_q, shift, src_nelems, src_nd, dst_nd, shape_strides, src_data, + fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, dst_data, all_deps); auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp index 09caddf824..32d41fc159 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp @@ -40,7 +40,6 @@ namespace py_internal extern std::pair copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, sycl::queue exec_q, const std::vector &depends = {}); diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 0000000000..eee129932f --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,419 @@ +//===----------- Implementation of 
_tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "copy_for_roll.hpp" +#include "dpctl4pybind11.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/type_dispatch.hpp" +#include + +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair +copy_usm_ndarray_for_roll_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + py::ssize_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + size_t offset = + (shift > 0) ? 
(shift % src_nelems) : src_nelems + (shift % src_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (both_c_contig || both_f_contig) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + constexpr py::ssize_t zero_offset = 0; + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape_ptr; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data, + dst_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto fn = copy_for_roll_strided_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + if (shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_for_roll_event); + auto ctx = exec_q.get_context(); + cgh.host_task( + [shape_strides, ctx]() { sycl::free(shape_strides, ctx); }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +std::pair +copy_usm_ndarray_for_roll_nd(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + const std::vector &shifts, + sycl::queue exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same number of dimensions."); + } + + if (static_cast(src_nd) != shifts.size()) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires 
shifts to " + "contain an integral shift for each array dimension."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + py::ssize_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + // check for compatible queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + std::vector normalized_shifts{}; + normalized_shifts.reserve(src_nd); + + for (int i = 0; i < src_nd; ++i) { + // normalize shift parameter to be 0 <= offset < dim + py::ssize_t dim = src_shape_ptr[i]; + size_t offset = + (shifts[i] > 0) ? 
(shifts[i] % dim) : dim + (shifts[i] % dim); + + normalized_shifts.push_back(offset); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + auto const &common_shape = src.get_shape_vector(); + + constexpr py::ssize_t src_offset = 0; + constexpr py::ssize_t dst_offset = 0; + + auto fn = copy_for_roll_ndshift_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, common_shape, src_strides, dst_strides, + normalized_shifts); + + py::ssize_t *shape_strides_shifts = std::get<0>(ptr_size_event_tuple); + if (shape_strides_shifts == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_for_roll_event); + auto ctx = exec_q.get_context(); + cgh.host_task([shape_strides_shifts, ctx]() { + sycl::free(shape_strides_shifts, ctx); + }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +void init_copy_for_roll_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory; + DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.hpp b/dpctl/tensor/libtensor/source/copy_for_roll.hpp new file mode 100644 index 0000000000..0c00710e11 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_roll.hpp @@ -0,0 +1,58 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +copy_usm_ndarray_for_roll_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern std::pair +copy_usm_ndarray_for_roll_nd(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + const std::vector &shifts, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp index 4097e76178..0e7bc195e9 100644 --- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -210,7 +210,7 @@ void copy_numpy_ndarray_into_usm_ndarray( simplified_dst_strides.push_back(1); } - // Minumum and maximum element offsets for source np.ndarray + // Minimum and maximum element offsets for source np.ndarray py::ssize_t npy_src_min_nelem_offset(src_offset); py::ssize_t npy_src_max_nelem_offset(src_offset); for (int i = 0; i < nd; ++i) { diff --git a/dpctl/tensor/libtensor/source/device_support_queries.cpp b/dpctl/tensor/libtensor/source/device_support_queries.cpp index 946c36ad26..d04c9c9ed2 100644 --- a/dpctl/tensor/libtensor/source/device_support_queries.cpp +++ b/dpctl/tensor/libtensor/source/device_support_queries.cpp @@ -51,7 +51,7 @@ std::string _default_device_fp_type(sycl::device d) std::string _default_device_int_type(sycl::device) { - return "l"; // code for numpy.dtype('long') to be consisent + return "l"; // code for numpy.dtype('long') to be consistent // with NumPy's default integer type across // platforms. 
} diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index 2fb2d6078e..31f5250f8a 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -369,6 +369,45 @@ void simplify_iteration_space_4( } } +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + py::ssize_t _ravel_multi_index_c(std::vector const &mi, std::vector const &shape) { diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 1bd8ff5aa0..275129ad5c 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -90,6 +90,13 @@ void simplify_iteration_space_4(int &, py::ssize_t &, py::ssize_t &); +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + py::ssize_t _ravel_multi_index_c(std::vector const &, std::vector const &); py::ssize_t _ravel_multi_index_f(std::vector const &, diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp index 3502a81a0e..13ab268b55 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ b/dpctl/tensor/libtensor/source/sum_reductions.cpp @@ -88,8 +88,11 @@ static sum_reduction_strided_impl_fn_ptr using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; static sum_reduction_contig_impl_fn_ptr - sum_over_axis_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static sum_reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; std::pair py_sum_over_axis( dpctl::tensor::usm_ndarray src, @@ -194,8 +197,32 @@ std::pair py_sum_over_axis( if ((is_src_c_contig && is_dst_c_contig) || (is_src_f_contig && dst_nelems == 1)) { - auto fn = sum_over_axis_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; + auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event sum_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + 
dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {sum_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, sum_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; if (fn != nullptr) { size_t iter_nelems = dst_nelems; @@ -271,27 +298,58 @@ std::pair py_sum_over_axis( iteration_src_offset, iteration_dst_offset); } - if (supports_atomics && (reduction_nd == 1) && - (simplified_reduction_src_strides[0] == 1) && (iteration_nd == 1) && - ((simplified_iteration_shape[0] == 1) || - ((simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems)))) - { - auto fn = - sum_over_axis_contig_atomic_dispatch_table[src_typeid][dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; + if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(simplified_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; + if (fn != nullptr) { + sycl::event sum_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, iteration_dst_offset, - reduction_src_offset, depends); + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {sum_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + sum_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; + if (fn != nullptr) { + sycl::event sum_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {sum_over_axis0_contig_ev}); - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); + return std::make_pair(keep_args_event, + sum_over_axis0_contig_ev); + } } } @@ -451,11 +509,17 @@ void populate_sum_over_axis_dispatch_table(void) dtb2; dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); - using dpctl::tensor::kernels::SumOverAxisAtomicContigFactory; + using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; 
DispatchTableBuilder + SumOverAxis1AtomicContigFactory, num_types> dtb3; - dtb3.populate_dispatch_table(sum_over_axis_contig_atomic_dispatch_table); + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); } namespace py = pybind11; diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 9b4ba6cdad..691a56b11f 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -37,6 +37,7 @@ #include "boolean_reductions.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" +#include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" #include "elementwise_functions.hpp" @@ -68,6 +69,11 @@ using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; +/* =========================== Copy for roll ============================= */ + +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; + /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; @@ -120,6 +126,7 @@ void init_dispatch_vectors(void) using namespace dpctl::tensor::py_internal; init_copy_for_reshape_dispatch_vectors(); + init_copy_for_roll_dispatch_vectors(); init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_eye_ctor_dispatch_vectors(); @@ -221,11 +228,27 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " "number of elements using underlying 'C'-contiguous order for flat " + "traversal. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for flat " "traversal with shift. " "Returns a tuple of events: (ht_event, comp_event)", py::arg("src"), py::arg("dst"), py::arg("shift"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for " + "traversal with shifts along each axis. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shifts"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " "specified by " @@ -245,7 +268,7 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_copy_numpy_ndarray_into_usm_ndarray", ©_numpy_ndarray_into_usm_ndarray, - "Copy fom numpy array `src` into usm_ndarray `dst` synchronously.", + "Copy from numpy array `src` into usm_ndarray `dst` synchronously.", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py index 9c800af812..2d2ec96fec 100644 --- a/dpctl/tests/elementwise/test_abs.py +++ b/dpctl/tests/elementwise/test_abs.py @@ -76,6 +76,15 @@ def test_abs_usm_type(usm_type): assert np.allclose(dpt.asnumpy(Y), expected_Y) +def test_abs_types_prop(): + types = dpt.abs.types_ + assert types is None + types = dpt.abs.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.abs.types_ + + @pytest.mark.parametrize("dtype", _all_dtypes[1:]) def test_abs_order(dtype): q = get_queue_or_skip() diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py index 891dda5252..2f5fd7c02a 100644 --- a/dpctl/tests/elementwise/test_add.py +++ b/dpctl/tests/elementwise/test_add.py @@ -258,6 +258,15 @@ def __sycl_usm_array_interface__(self): dpt.add(a, c) +def test_add_types_property(): + types = dpt.add.types_ + assert types is None + types = dpt.add.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.add.types_ + + def test_add_errors(): get_queue_or_skip() try: diff --git a/dpctl/tests/test_sycl_queue_manager.py b/dpctl/tests/test_sycl_queue_manager.py index c01dca0139..d640042eb9 100644 --- a/dpctl/tests/test_sycl_queue_manager.py +++ b/dpctl/tests/test_sycl_queue_manager.py @@ -1,6 +1,6 @@ # Data Parallel Control (dpctl) # -# Copyright 2020-2022 Intel Corporation +# Copyright 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -69,8 +69,8 @@ def test_is_in_device_context_inside_nested_device_ctxt_cpu():
     n = cpu.max_compute_units
     n_half = n // 2
     try:
-        d0, d1 = cpu.create_subdevices(partition=[n_half, n - n_half])
-    except Exception:
+        d0, d1 = cpu.create_sub_devices(partition=[n_half, n - n_half])
+    except dpctl.SyclSubDeviceCreationError:
         pytest.skip("Could not create subdevices")
     assert 0 == dpctl.get_num_activated_queues()
     with dpctl.device_context(d0):
diff --git a/dpctl/tests/test_tensor_asarray.py b/dpctl/tests/test_tensor_asarray.py
index f9bc31972c..e73c35ce26 100644
--- a/dpctl/tests/test_tensor_asarray.py
+++ b/dpctl/tests/test_tensor_asarray.py
@@ -71,7 +71,7 @@ def test_asarray_from_numpy():
     assert type(Y) is dpt.usm_ndarray
     assert Y.shape == Xnp.shape
     assert Y.dtype == Xnp.dtype
-    # Fortan contiguous case
+    # Fortran contiguous case
     Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="f4", order="F")
     Y = dpt.asarray(Xnp, usm_type="shared")
     assert type(Y) is dpt.usm_ndarray
diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py
index fc2a0ec8de..403a823324 100644
--- a/dpctl/tests/test_tensor_sum.py
+++ b/dpctl/tests/test_tensor_sum.py
@@ -172,3 +172,18 @@ def test_largish_reduction(arg_dtype, n):
 
     assert dpt.all(dpt.equal(y1, y2))
     assert dpt.all(dpt.equal(y1, n * m))
+
+
+def test_axis0_bug():
+    "gh-1391"
+    get_queue_or_skip()
+
+    sh = (1, 2, 3)
+    a = dpt.arange(sh[0] * sh[1] * sh[2], dtype="i4")
+    a = dpt.reshape(a, sh)
+    aT = dpt.permute_dims(a, (2, 1, 0))
+
+    s = dpt.sum(aT, axis=2)
+    expected = dpt.asarray([[0, 3], [1, 4], [2, 5]])
+
+    assert dpt.all(s == expected)
diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index 77edc1f22e..72f5aabebb 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -239,8 +239,9 @@ def test_copy_scalar_with_func(func, shape, dtype):
         X = dpt.usm_ndarray(shape, dtype=dtype)
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
-    Y = np.arange(1, X.size + 1, dtype=dtype).reshape(shape)
-    X.usm_data.copy_from_host(Y.reshape(-1).view("|u1"))
+    Y = np.arange(1, X.size + 1, dtype=dtype)
+    X.usm_data.copy_from_host(Y.view("|u1"))
+    Y.shape = tuple()
     assert func(X) == func(Y)
 
 
@@ -254,8 +255,9 @@ def test_copy_scalar_with_method(method, shape, dtype):
         X = dpt.usm_ndarray(shape, dtype=dtype)
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
-    Y = np.arange(1, X.size + 1, dtype=dtype).reshape(shape)
-    X.usm_data.copy_from_host(Y.reshape(-1).view("|u1"))
+    Y = np.arange(1, X.size + 1, dtype=dtype)
+    X.usm_data.copy_from_host(Y.view("|u1"))
+    Y.shape = tuple()
     assert getattr(X, method)() == getattr(Y, method)()
 
 
@@ -1438,17 +1440,26 @@ def test_real_imag_views():
     n, m = 2, 3
     try:
         X = dpt.usm_ndarray((n, m), "c8")
+        X_scalar = dpt.usm_ndarray((), dtype="c8")
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
     Xnp_r = np.arange(n * m, dtype="f4").reshape((n, m))
     Xnp_i = np.arange(n * m, 2 * n * m, dtype="f4").reshape((n, m))
     Xnp = Xnp_r + 1j * Xnp_i
     X[:] = Xnp
-    assert np.array_equal(dpt.to_numpy(X.real), Xnp.real)
+    X_real = X.real
+    X_imag = X.imag
+    assert np.array_equal(dpt.to_numpy(X_real), Xnp.real)
     assert np.array_equal(dpt.to_numpy(X.imag), Xnp.imag)
+    assert not X_real.flags["C"] and not X_real.flags["F"]
+    assert not X_imag.flags["C"] and not X_imag.flags["F"]
+    assert X_real.strides == X_imag.strides
     assert np.array_equal(dpt.to_numpy(X[1:].real), Xnp[1:].real)
     assert np.array_equal(dpt.to_numpy(X[1:].imag), Xnp[1:].imag)
+    X_scalar[...] = complex(n * m, 2 * n * m)
+    assert X_scalar.real and X_scalar.imag
+
 
 
 @pytest.mark.parametrize(
     "dtype",
@@ -2125,6 +2136,8 @@ def test_flags():
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
     f = x.flags
+    # check comparison with generic types
+    assert f != Ellipsis
     f.__repr__()
     assert f.c_contiguous == f["C"]
     assert f.f_contiguous == f["F"]
@@ -2133,8 +2146,6 @@ def test_flags():
     assert f.forc == f["FORC"]
     assert f.fnc == f["FNC"]
     assert f.writable == f["W"]
-    # check comparison with generic types
-    f == Ellipsis
 
 
 def test_asarray_uint64():
diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py
index 9d166226e7..9183226be2 100644
--- a/dpctl/tests/test_usm_ndarray_indexing.py
+++ b/dpctl/tests/test_usm_ndarray_indexing.py
@@ -1044,6 +1044,19 @@ def test_extract_all_1d():
     res2 = dpt.extract(sel, x)
     assert (dpt.asnumpy(res2) == expected_res).all()
 
+    # test strided case
+    x = dpt.arange(15, dtype="i4")
+    sel_np = np.zeros(15, dtype="?")
+    np.put(sel_np, np.random.choice(sel_np.size, size=7), True)
+    sel = dpt.asarray(sel_np)
+
+    res = x[sel[::-1]]
+    expected_res = dpt.asnumpy(x)[sel_np[::-1]]
+    assert (dpt.asnumpy(res) == expected_res).all()
+
+    res2 = dpt.extract(sel[::-1], x)
+    assert (dpt.asnumpy(res2) == expected_res).all()
+
 
 def test_extract_all_2d():
     get_queue_or_skip()
@@ -1287,6 +1300,38 @@ def test_nonzero():
     assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all()
 
 
+def test_nonzero_f_contig():
+    "See gh-1370"
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5), dtype="?", order="F")
+    mask[2, 3] = True
+
+    expected_res = (2, 3)
+    res = dpt.nonzero(mask)
+
+    assert expected_res == res
+    assert mask[res]
+
+
+def test_nonzero_compacting():
+    """See gh-1370.
+    Test with input where dimensionality
+    of iteration space is compacted from 3d to 2d
+    """
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5, 5), dtype="?", order="F")
+    mask[3, 2, 1] = True
+    mask_view = mask[..., :3]
+
+    expected_res = (3, 2, 1)
+    res = dpt.nonzero(mask_view)
+
+    assert expected_res == res
+    assert mask_view[res]
+
+
 def test_assign_scalar():
     get_queue_or_skip()
     x = dpt.arange(-5, 5, dtype="i8")
diff --git a/dpctl/tests/test_usm_ndarray_linalg.py b/dpctl/tests/test_usm_ndarray_linalg.py
new file mode 100644
index 0000000000..4023eb8ad7
--- /dev/null
+++ b/dpctl/tests/test_usm_ndarray_linalg.py
@@ -0,0 +1,48 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip
+
+
+def test_matrix_transpose():
+    get_queue_or_skip()
+
+    X = dpt.reshape(dpt.arange(2 * 3, dtype="i4"), (2, 3))
+    res = dpt.matrix_transpose(X)
+    expected_res = X.mT
+
+    assert expected_res.shape == res.shape
+    assert expected_res.flags["C"] == res.flags["C"]
+    assert expected_res.flags["F"] == res.flags["F"]
+    assert dpt.all(X.mT == res)
+
+
+def test_matrix_transpose_arg_validation():
+    get_queue_or_skip()
+
+    X = dpt.empty(5, dtype="i4")
+    with pytest.raises(ValueError):
+        dpt.matrix_transpose(X)
+
+    X = dict()
+    with pytest.raises(TypeError):
+        dpt.matrix_transpose(X)
+
+    X = dpt.empty((5, 5), dtype="i4")
+    assert isinstance(dpt.matrix_transpose(X), dpt.usm_ndarray)
diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py
index 1cee5e6c8f..6152a15aae 100644
--- a/dpctl/tests/test_usm_ndarray_manipulation.py
+++ b/dpctl/tests/test_usm_ndarray_manipulation.py
@@ -648,6 +648,19 @@ def test_roll_2d(data):
     assert_array_equal(Ynp, dpt.asnumpy(Y))
 
 
+def test_roll_validation():
+    get_queue_or_skip()
+
+    X = dict()
+    with pytest.raises(TypeError):
+        dpt.roll(X)
+
+    X = dpt.empty((1, 2, 3))
+    shift = ((2, 3, 1), (1, 2, 3))
+    with pytest.raises(ValueError):
+        dpt.roll(X, shift=shift, axis=(0, 1, 2))
+
+
 def test_concat_incorrect_type():
     Xnp = np.ones((2, 2))
     pytest.raises(TypeError, dpt.concat)
diff --git a/libsyclinterface/include/dpctl_data_types.h b/libsyclinterface/include/dpctl_data_types.h
index c8c812b86d..2e644f1327 100644
--- a/libsyclinterface/include/dpctl_data_types.h
+++ b/libsyclinterface/include/dpctl_data_types.h
@@ -81,21 +81,21 @@ typedef signed int ssize_t;
 #endif /* _MSC_VER */
 
 /*!
- @brief Represents tha largest possible value of a 64 bit signed integer.
+ @brief Represents the largest possible value of a 64 bit signed integer.
 */
 #if !defined(INT64_MAX)
 #define INT64_MAX 9223372036854775807LL
 #endif
 
 /*!
- @brief Represents tha smallest possible value of a 64 bit signed integer.
+ @brief Represents the smallest possible value of a 64 bit signed integer.
 */
 #if !defined(INT64_MIN)
 #define INT64_MIN ((-INT64_MAX) - 1)
 #endif
 
 /*!
- @brief Represents tha largest possible value of a 64bit unsigned integer.
+ @brief Represents the largest possible value of a 64bit unsigned integer.
 */
 #if !defined(UINT64_MAX)
 #define UINT64_MAX 0xffffffffffffffffULL
diff --git a/libsyclinterface/include/dpctl_device_selection.hpp b/libsyclinterface/include/dpctl_device_selection.hpp
index 6aa4e69ec7..9da0072ab1 100644
--- a/libsyclinterface/include/dpctl_device_selection.hpp
+++ b/libsyclinterface/include/dpctl_device_selection.hpp
@@ -1,4 +1,5 @@
-//===-- dpctl_device_selection.h - Device selector class declar. --*-C++-*- =//
+//===-- dpctl_device_selection.h -
+// Device selector class declaration --*-C++-*- =//
 //
 //
 // Data Parallel Control (dpctl)
diff --git a/libsyclinterface/include/dpctl_sycl_device_interface.h b/libsyclinterface/include/dpctl_sycl_device_interface.h
index 50c4d1f1f2..3050d7449f 100644
--- a/libsyclinterface/include/dpctl_sycl_device_interface.h
+++ b/libsyclinterface/include/dpctl_sycl_device_interface.h
@@ -233,19 +233,6 @@ DPCTL_API
 __dpctl_keep size_t *
 DPCTLDevice_GetMaxWorkItemSizes3d(__dpctl_keep const DPCTLSyclDeviceRef DRef);
 
-/*!
- * @brief Wrapper for deprecated get_info().
- *
- * @param DRef Opaque pointer to a ``sycl::device``
- * @return Returns the valid result if device exists else returns NULL.
- * @ingroup DeviceInterface
- */
-#if __cplusplus || (defined(__GNUC__) && __GNUC__ > 10)
-[[deprecated("Use DPCTLDevice_WorkItemSizes3d instead")]]
-#endif
-DPCTL_API __dpctl_keep size_t *
-DPCTLDevice_GetMaxWorkItemSizes(__dpctl_keep const DPCTLSyclDeviceRef DRef);
-
 /*!
  * @brief Wrapper for get_info().
  *
diff --git a/libsyclinterface/include/dpctl_sycl_device_selector_interface.h b/libsyclinterface/include/dpctl_sycl_device_selector_interface.h
index c1cd5fcd5c..f439f3281a 100644
--- a/libsyclinterface/include/dpctl_sycl_device_selector_interface.h
+++ b/libsyclinterface/include/dpctl_sycl_device_selector_interface.h
@@ -19,7 +19,7 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This header declares C contructors for the various SYCL device_selector
+/// This header declares C constructors for the various SYCL device_selector
 /// classes.
 ///
 //===----------------------------------------------------------------------===//
diff --git a/libsyclinterface/include/dpctl_sycl_event_interface.h b/libsyclinterface/include/dpctl_sycl_event_interface.h
index a49d17f12e..6bb4d0412c 100644
--- a/libsyclinterface/include/dpctl_sycl_event_interface.h
+++ b/libsyclinterface/include/dpctl_sycl_event_interface.h
@@ -44,7 +44,7 @@ DPCTL_C_EXTERN_C_BEGIN
 DPCTL_DECLARE_VECTOR(Event)
 
 /*!
- * @brief A wrapper for ``sycl::event`` contructor to construct a new event.
+ * @brief A wrapper for ``sycl::event`` constructor to construct a new event.
  *
  * @return An opaque DPCTLSyclEventRef pointer wrapping a ``sycl::event``.
  * @ingroup EventInterface
diff --git a/libsyclinterface/include/dpctl_sycl_queue_interface.h b/libsyclinterface/include/dpctl_sycl_queue_interface.h
index 8dd07280d2..1c5e53a395 100644
--- a/libsyclinterface/include/dpctl_sycl_queue_interface.h
+++ b/libsyclinterface/include/dpctl_sycl_queue_interface.h
@@ -42,8 +42,8 @@ DPCTL_C_EXTERN_C_BEGIN
 */
 
 /*!
- * @brief A wrapper for sycl::queue contructor to construct a new queue from the
- * provided context, device, async handler and propertis bit flags.
+ * @brief A wrapper for sycl::queue constructor to construct a new queue from
+ * the provided context, device, async handler and properties bit flags.
  *
  * @param CRef An opaque pointer to a sycl::context.
 * @param DRef An opaque pointer to a sycl::device
@@ -362,7 +362,7 @@ DPCTL_API
 size_t DPCTLQueue_Hash(__dpctl_keep const DPCTLSyclQueueRef QRef);
 
 /*!
- * @brief C-API wraper for ``sycl::queue::submit_barrier()``.
+ * @brief C-API wrapper for ``sycl::queue::submit_barrier()``.
 *
 * @param QRef An opaque pointer to the ``sycl::queue``.
 * @return An opaque pointer to the ``sycl::event`` returned by the
@@ -373,7 +373,7 @@ __dpctl_give DPCTLSyclEventRef
 DPCTLQueue_SubmitBarrier(__dpctl_keep const DPCTLSyclQueueRef QRef);
 
 /*!
- * @brief C-API wraper for ``sycl::queue::submit_barrier(event_vector)``.
+ * @brief C-API wrapper for ``sycl::queue::submit_barrier(event_vector)``.
 *
 * @param QRef An opaque pointer to the ``sycl::queue``.
 * @param DepEvents List of dependent DPCTLSyclEventRef objects (events)
diff --git a/libsyclinterface/include/dpctl_vector.h b/libsyclinterface/include/dpctl_vector.h
index fea49925ea..792368bbc3 100644
--- a/libsyclinterface/include/dpctl_vector.h
+++ b/libsyclinterface/include/dpctl_vector.h
@@ -83,8 +83,8 @@ DPCTL_C_EXTERN_C_BEGIN
     /*! \
      @brief Returns the element at the specified index. \
      @param VRef Opaque pointer to a vector. \
-     @param index The index postion of the element to be returned. \
-     @return The element at the specified postion, if the index position is \
+     @param index The index position of the element to be returned. \
+     @return The element at the specified position, if the index position is \
      out of bounds then a nullptr is returned. \
     */ \
     DPCTL_API \
diff --git a/libsyclinterface/source/dpctl_sycl_device_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
index bcdd46a9fd..73dfcc7e11 100644
--- a/libsyclinterface/source/dpctl_sycl_device_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
@@ -266,12 +266,6 @@ DPCTLDevice_GetMaxWorkItemSizes3d(__dpctl_keep const DPCTLSyclDeviceRef DRef)
     return DPCTLDevice__GetMaxWorkItemSizes<3>(DRef);
 }
 
-__dpctl_keep size_t *
-DPCTLDevice_GetMaxWorkItemSizes(__dpctl_keep const DPCTLSyclDeviceRef DRef)
-{
-    return DPCTLDevice__GetMaxWorkItemSizes<3>(DRef);
-}
-
 size_t
 DPCTLDevice_GetMaxWorkGroupSize(__dpctl_keep const DPCTLSyclDeviceRef DRef)
 {
diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
index 387374a032..ce318ce37e 100644
--- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
@@ -143,6 +143,9 @@ std::unique_ptr<property_list> create_property_list(int properties)
         propList = std::make_unique<property_list>(
             sycl::property::queue::in_order());
     }
+    else {
+        propList = std::make_unique<property_list>();
+    }
 
     if (_prop) {
         std::stringstream ss;
@@ -185,7 +188,7 @@ DPCTLQueue_Create(__dpctl_keep const DPCTLSyclContextRef CRef,
     }
     auto propList = create_property_list(properties);
 
-    if (propList && handler) {
+    if (handler) {
         try {
             auto Queue = new queue(*ctx, *dev, DPCTL_AsyncErrorHandler(handler),
                                    *propList);
@@ -194,26 +197,9 @@ DPCTLQueue_Create(__dpctl_keep const DPCTLSyclContextRef CRef,
             error_handler(e, __FILE__, __func__, __LINE__);
         }
     }
-    else if (properties) {
-        try {
-            auto Queue = new queue(*ctx, *dev, *propList);
-            q = wrap(Queue);
-        } catch (std::exception const &e) {
-            error_handler(e, __FILE__, __func__, __LINE__);
-        }
-    }
-    else if (handler) {
-        try {
-            auto Queue =
-                new queue(*ctx, *dev, DPCTL_AsyncErrorHandler(handler));
-            q = wrap(Queue);
-        } catch (std::exception const &e) {
-            error_handler(e, __FILE__, __func__, __LINE__);
-        }
-    }
     else {
         try {
-            auto Queue = new queue(*ctx, *dev);
+            auto Queue = new queue(*ctx, *dev, *propList);
             q = wrap(Queue);
         } catch (std::exception const &e) {
             error_handler(e, __FILE__, __func__, __LINE__);
diff --git a/libsyclinterface/tests/test_sycl_device_interface.cpp b/libsyclinterface/tests/test_sycl_device_interface.cpp
index 7e92c8c9de..dd20c738df 100644
--- a/libsyclinterface/tests/test_sycl_device_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_device_interface.cpp
@@ -175,14 +175,6 @@ TEST_P(TestDPCTLSyclDeviceInterface, ChkGetMaxWorkItemSizes3d)
     EXPECT_NO_FATAL_FAILURE(DPCTLSize_t_Array_Delete(sizes));
 }
 
-TEST_P(TestDPCTLSyclDeviceInterface, ChkGetMaxWorkItemSizes)
-{
-    size_t *sizes = nullptr;
-    EXPECT_NO_FATAL_FAILURE(sizes = DPCTLDevice_GetMaxWorkItemSizes(DRef));
-    EXPECT_TRUE(sizes != nullptr);
-    EXPECT_NO_FATAL_FAILURE(DPCTLSize_t_Array_Delete(sizes));
-}
-
 TEST_P(TestDPCTLSyclDeviceInterface, ChkGetMaxWorkGroupSize)
 {
     size_t n = 0;
@@ -625,10 +617,24 @@ TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemDims)
     ASSERT_TRUE(md == 0);
 }
 
-TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes)
+TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes1d)
+{
+    size_t *sz = nullptr;
+    EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes1d(Null_DRef));
+    ASSERT_TRUE(sz == nullptr);
+}
+
+TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes2d)
+{
+    size_t *sz = nullptr;
+    EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes2d(Null_DRef));
+    ASSERT_TRUE(sz == nullptr);
+}
+
+TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes3d)
 {
     size_t *sz = nullptr;
-    EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes(Null_DRef));
+    EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes3d(Null_DRef));
     ASSERT_TRUE(sz == nullptr);
 }
diff --git a/scripts/gen_coverage.py b/scripts/gen_coverage.py
index 41b30d00d9..93328f1b85 100644
--- a/scripts/gen_coverage.py
+++ b/scripts/gen_coverage.py
@@ -57,7 +57,7 @@ def run(
         "-DDPCTL_BUILD_CAPI_TESTS=ON",
         "-DDPCTL_COVERAGE_REPORT_OUTPUT_DIR=" + setup_dir,
     ]
-    env = None
+    env = dict()
     if bin_llvm:
         env = {
             "PATH": ":".join((os.environ.get("PATH", ""), bin_llvm)),
diff --git a/scripts/gen_docs.py b/scripts/gen_docs.py
index 377b37f6d8..9e2285a477 100644
--- a/scripts/gen_docs.py
+++ b/scripts/gen_docs.py
@@ -59,7 +59,7 @@ def run(
         cmake_args.append("-DDPCTL_ENABLE_DOXYREST=ON")
         cmake_args.append("-DDoxyrest_DIR=" + doxyrest_dir)
 
-    env = None
+    env = dict()
     if bin_llvm:
         env = {
             "PATH": ":".join((os.environ.get("PATH", ""), bin_llvm)),
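# Reviewer note, not part of the patch: the new dpctl/tests/test_usm_ndarray_linalg.py
# exercises dpt.matrix_transpose(), which mirrors the array-API x.mT property of
# usm_ndarray. A minimal usage sketch, assuming dpctl is installed and a working SYCL
# device is available:
#
#     import dpctl.tensor as dpt
#
#     x = dpt.reshape(dpt.arange(6, dtype="i4"), (2, 3))
#     y = dpt.matrix_transpose(x)  # same result as x.mT; swaps the last two axes
#     assert y.shape == (3, 2)
#     assert bool(dpt.all(y == x.mT))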