From 7ed9c7e8a38bcb2fb1ba54e1f945b1edf9e1ef77 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov
Date: Tue, 15 Apr 2025 16:29:00 +0200
Subject: [PATCH 01/13] Initial commit

---
 .../extensions/statistics/CMakeLists.txt    |   1 +
 .../extensions/statistics/kth_element1d.cpp | 381 ++++++++++++++++++
 .../extensions/statistics/kth_element1d.hpp |  56 +++
 .../extensions/statistics/partitioning.hpp  | 356 ++++++++++++++++
 .../extensions/statistics/statistics_py.cpp |   2 +
 5 files changed, 796 insertions(+)
 create mode 100644 dpnp/backend/extensions/statistics/kth_element1d.cpp
 create mode 100644 dpnp/backend/extensions/statistics/kth_element1d.hpp
 create mode 100644 dpnp/backend/extensions/statistics/partitioning.hpp

diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 2a5467bff382..1fa481546e53 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -30,6 +30,7 @@ set(_module_src
     ${CMAKE_CURRENT_SOURCE_DIR}/histogram.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/histogramdd.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/histogram_common.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/kth_element1d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sliding_dot_product1d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sliding_window1d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/statistics_py.cpp
diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp
new file mode 100644
index 000000000000..9e2a2e235886
--- /dev/null
+++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp
@@ -0,0 +1,381 @@
+//*****************************************************************************
+// Copyright (c) 2024-2025, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** + +#include +#include +#include +#include + +#include +#include + +// dpctl tensor headers +#include "dpctl4pybind11.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "ext/common.hpp" +#include "kth_element1d.hpp" +#include "partitioning.hpp" + +// #include + +namespace sycl_exp = sycl::ext::oneapi::experimental; +namespace dpctl_td_ns = dpctl::tensor::type_dispatch; +namespace dpctl_utils = dpctl::tensor::alloc_utils; + +using dpctl::tensor::usm_ndarray; + +using namespace statistics::partitioning; +using namespace ext::common; + +namespace +{ + +template +struct pick_pivot_kernel; + +template +struct KthElementF +{ + static sycl::event run_pick_pivot(sycl::queue &queue, + T *in, + T *out, + uint64_t target, + State &state, + uint64_t items_to_sort, + uint64_t limit, + const std::vector &deps) + { + auto e = queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + constexpr uint64_t group_size = 128; + + auto work_sz = make_ndrange(group_size, group_size, 1); + + size_t temp_memory_size = + sycl_exp::default_sorters::joint_sorter<>::memory_required( + sycl::memory_scope::work_group, limit); + + auto loc_items = + sycl::local_accessor(sycl::range<1>(items_to_sort), cgh); + auto scratch = sycl::local_accessor( + sycl::range<1>(temp_memory_size), cgh); + + cgh.parallel_for>( + work_sz, [=](sycl::nd_item<1> item) { + auto group = item.get_group(); + + if (state.stop[0]) + return; + + auto llid = item.get_local_linear_id(); + auto local_size = item.get_group_range(0); + + uint64_t num_elems = 0; + bool target_found = false; + + T *_in = nullptr; + if (group.leader()) { + state.update_counters(); + auto less_count = state.counters.less_count[0]; + bool left = target < less_count; + state.left[0] = left; + + if (left) { + _in = in; + num_elems = state.iteration_counters.less_count[0]; + if (target + 1 == less_count) { + _in[num_elems] = state.pivot[0]; + state.counters.less_count[0] += 1; + num_elems += 1; + } + } + else { + num_elems = + state.iteration_counters.greater_equal_count[0]; + _in = in + state.n - num_elems; + + if (target + 1 < + less_count + + state.iteration_counters.equal_count[0]) { + state.values[0] = state.pivot[0]; + state.values[1] = state.pivot[0]; + + state.stop[0] = true; + state.target_found[0] = true; + target_found = true; + } + } + + state.reset_iteration_counters(); + } + + target_found = + sycl::group_broadcast(group, target_found, 0); + _in = sycl::group_broadcast(group, _in, 0); + num_elems = sycl::group_broadcast(group, num_elems, 0); + + if (target_found) { + return; + } + + if (num_elems <= limit) { + auto gh = sycl_exp::group_with_scratchpad( + group, sycl::span{&scratch[0], temp_memory_size}); + sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems]); + + if (group.leader()) { + uint64_t offset = state.counters.less_count[0]; + if (state.left[0]) { + offset = + state.counters.less_count[0] - num_elems; + } + + uint64_t idx = target - offset; + state.values[0] = _in[idx]; + state.values[1] = _in[idx + 1]; + + state.stop[0] = true; + state.target_found[0] = true; + } + + return; + } + + uint64_t step = num_elems / items_to_sort; + for (uint32_t i = llid; i < items_to_sort; i += local_size) + { + loc_items[i] = std::numeric_limits::max(); + uint32_t idx = i * step; + if (idx < num_elems) { + loc_items[i] = _in[idx]; + } + } + + sycl::group_barrier(group); + + auto gh = sycl_exp::group_with_scratchpad( + group, sycl::span{&scratch[0], 
temp_memory_size}); + sycl_exp::joint_sort(gh, &loc_items[0], + &loc_items[0] + items_to_sort); + + T new_pivot = loc_items[items_to_sort / 2]; + + if (new_pivot != state.pivot[0]) { + if (group.leader()) { + state.pivot[0] = new_pivot; + state.num_elems[0] = num_elems; + } + return; + } + + auto start = llid + items_to_sort / 2 + 1; + uint32_t index = start; + for (uint32_t i = start; i < items_to_sort; i += local_size) + { + if (loc_items[i] != new_pivot) { + index = i; + break; + } + } + + index = sycl::reduce_over_group(group, index, + sycl::minimum<>()); + if (group.leader()) { + state.pivot[0] = loc_items[index]; + state.num_elems[0] = num_elems; + } + }); + }); + + return e; + } + + static sycl::event run_partition(sycl::queue &exec_q, + T *in, + T *out, + PartitionState &state, + const std::vector &deps) + { + + uint32_t group_size = 128; + auto e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + + constexpr uint32_t WorkPI = 4; // empirically found number + + auto work_range = make_ndrange(state.n, group_size, WorkPI); + submit_partition_one_pivot(cgh, work_range, in, out, + state); + }); + + return e; + } + + static sycl::event run_kth_element(sycl::queue &exec_q, + const T *in, + T *partitioned, + const size_t k, + State &state, + PartitionState &pstate, + const std::vector &depends) + { + uint32_t items_to_sort = 128; + uint32_t limit = 4 * items_to_sort; + uint32_t iterations = + std::ceil(std::log(double(state.n) / limit) / std::log(2)); + + auto temp_buff = dpctl_utils::smart_malloc(state.n, exec_q, + sycl::usm::alloc::device); + + auto prev = run_pick_pivot(exec_q, const_cast(in), partitioned, k, + state, items_to_sort, limit, depends); + prev = run_partition(exec_q, const_cast(in), partitioned, pstate, + {prev}); + + T *_in = partitioned; + T *_out = temp_buff.get(); + for (uint32_t i = 0; i < iterations - 1; ++i) { + prev = run_pick_pivot(exec_q, _in, _out, k, state, limit, + items_to_sort, {prev}); + prev = run_partition(exec_q, _in, _out, pstate, {prev}); + std::swap(_in, _out); + } + prev = run_pick_pivot(exec_q, _in, _out, k, state, limit, items_to_sort, + {prev}); + + return prev; + } + + static std::tuple + impl(sycl::queue &exec_queue, + const void *v_ain, + void *v_partitioned, + const size_t a_size, + const size_t k, + const std::vector &depends) + { + const T *ain = static_cast(v_ain); + T *partitioned = static_cast(v_partitioned); + + State state(exec_queue, a_size, partitioned); + PartitionState pstate(state); + + auto init_e = state.init(exec_queue, depends); + init_e = pstate.init(exec_queue, {init_e}); + + auto evt = run_kth_element(exec_queue, ain, partitioned, k, state, + pstate, {init_e}); + + bool found = false; + bool left = false; + uint64_t less_count = 0; + uint64_t greater_equal_count = 0; + uint64_t num_elems = 0; + auto copy_evt = exec_queue.copy(state.target_found, &found, 1, evt); + copy_evt = exec_queue.copy(state.left, &left, 1, copy_evt); + copy_evt = exec_queue.copy(state.counters.less_count, &less_count, 1, + copy_evt); + copy_evt = exec_queue.copy(state.counters.greater_equal_count, + &greater_equal_count, 1, copy_evt); + copy_evt = exec_queue.copy(state.num_elems, &num_elems, 1, copy_evt); + + copy_evt.wait(); + + uint64_t buff_offset = 0; + uint64_t elems_offset = less_count; + if (!found) { + if (left) { + elems_offset = less_count - num_elems; + } + else { + buff_offset = a_size - num_elems; + } + } + else { + num_elems = 2; + elems_offset = k; + } + + state.cleanup(exec_queue); + + return {found, buff_offset, 
elems_offset, num_elems}; + } +}; + +using SupportedTypes = + std::tuple; +} // namespace + +KthElement1d::KthElement1d() : dispatch_table("a") +{ + dispatch_table.populate_dispatch_table(); +} + +std::tuple + KthElement1d::call(const dpctl::tensor::usm_ndarray &a, + dpctl::tensor::usm_ndarray &partitioned, + const size_t k, + const std::vector &depends) +{ + // validate(a, partitioned, k); + + const int a_typenum = a.get_typenum(); + auto kth_elem_func = dispatch_table.get(a_typenum); + + auto exec_q = a.get_queue(); + auto result = kth_elem_func(exec_q, a.get_data(), partitioned.get_data(), + a.get_shape(0), k, depends); + + return result; +} + +std::unique_ptr kth; + +void statistics::partitioning::populate_kth_element1d(py::module_ m) +{ + using namespace std::placeholders; + + kth.reset(new KthElement1d()); + + auto kth_func = [kthp = kth.get()]( + const dpctl::tensor::usm_ndarray &a, + dpctl::tensor::usm_ndarray &partitioned, const size_t k, + const std::vector &depends) { + return kthp->call(a, partitioned, k, depends); + }; + + m.def("kth_element", kth_func, "finding k and k+1 elements.", py::arg("a"), + py::arg("partitioned"), py::arg("k"), + py::arg("depends") = py::list()); + + auto kth_dtypes = [kthp = kth.get()]() { + return kthp->dispatch_table.get_all_supported_types(); + }; + + m.def("kth_element_dtypes", kth_dtypes, + "Get the supported data types for kth_element."); +} diff --git a/dpnp/backend/extensions/statistics/kth_element1d.hpp b/dpnp/backend/extensions/statistics/kth_element1d.hpp new file mode 100644 index 000000000000..06219823423b --- /dev/null +++ b/dpnp/backend/extensions/statistics/kth_element1d.hpp @@ -0,0 +1,56 @@ +//***************************************************************************** +// Copyright (c) 2024-2025, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+
+#pragma once
+
+#include "ext/dispatch_table.hpp"
+#include
+#include
+
+namespace statistics::partitioning
+{
+struct KthElement1d
+{
+    using FnT = std::tuple<bool, uint64_t, uint64_t, uint64_t> (*)(
+        sycl::queue &,
+        const void *,
+        void *,
+        const size_t,
+        const size_t,
+        const std::vector<sycl::event> &);
+
+    ext::common::DispatchTable<FnT> dispatch_table;
+
+    KthElement1d();
+
+    std::tuple<bool, uint64_t, uint64_t, uint64_t>
+        call(const dpctl::tensor::usm_ndarray &a,
+             dpctl::tensor::usm_ndarray &partitioned,
+             uint64_t k,
+             const std::vector<sycl::event> &depends);
+};
+
+void populate_kth_element1d(py::module_ m);
+} // namespace statistics::partitioning
diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp
new file mode 100644
index 000000000000..b249272a9c74
--- /dev/null
+++ b/dpnp/backend/extensions/statistics/partitioning.hpp
@@ -0,0 +1,356 @@
+//*****************************************************************************
+// Copyright (c) 2024-2025, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** + +#pragma once + +#include "utils/math_utils.hpp" +#include +#include + +#include + +#include "ext/common.hpp" + +using dpctl::tensor::usm_ndarray; + +using ext::common::AtomicOp; +using ext::common::IsNan; +using ext::common::Less; +using ext::common::make_ndrange; + +namespace statistics::partitioning +{ + +struct Counters +{ + uint64_t *less_count; + uint64_t *equal_count; + uint64_t *greater_equal_count; + uint64_t *nan_count; + + Counters(sycl::queue &queue) + { + less_count = sycl::malloc_device(1, queue); + equal_count = sycl::malloc_device(1, queue); + greater_equal_count = sycl::malloc_device(1, queue); + nan_count = sycl::malloc_device(1, queue); + }; + + void cleanup(sycl::queue &queue) + { + sycl::free(less_count, queue); + sycl::free(equal_count, queue); + sycl::free(greater_equal_count, queue); + sycl::free(nan_count, queue); + } +}; + +template +struct State +{ + Counters counters; + Counters iteration_counters; + + bool *stop; + bool *target_found; + bool *left; + + T *pivot; + T *values; + + size_t *num_elems; + + size_t n; + + State(sycl::queue &queue, size_t _n, T *values_buff) + : counters(queue), iteration_counters(queue) + { + stop = sycl::malloc_device(1, queue); + target_found = sycl::malloc_device(1, queue); + left = sycl::malloc_device(1, queue); + + pivot = sycl::malloc_device(1, queue); + values = values_buff; + + num_elems = sycl::malloc_device(1, queue); + + n = _n; + } + + sycl::event init(sycl::queue &queue, const std::vector &deps) + { + sycl::event fill_e = + queue.fill(counters.less_count, 0, 1, deps); + fill_e = queue.fill(counters.equal_count, 0, 1, {fill_e}); + fill_e = + queue.fill(counters.greater_equal_count, n, 1, {fill_e}); + fill_e = queue.fill(counters.nan_count, 0, 1, {fill_e}); + fill_e = queue.fill(num_elems, 0, 1, {fill_e}); + fill_e = queue.fill(stop, false, 1, {fill_e}); + fill_e = queue.fill(target_found, false, 1, {fill_e}); + fill_e = queue.fill(left, false, 1, {fill_e}); + fill_e = queue.fill(pivot, 0, 1, {fill_e}); + + return fill_e; + } + + void update_counters() const + { + if (*left) { + counters.less_count[0] -= iteration_counters.greater_equal_count[0]; + counters.greater_equal_count[0] += + iteration_counters.greater_equal_count[0]; + } + else { + counters.less_count[0] += iteration_counters.less_count[0]; + counters.greater_equal_count[0] -= iteration_counters.less_count[0]; + } + counters.equal_count[0] = iteration_counters.equal_count[0]; + counters.nan_count[0] += iteration_counters.nan_count[0]; + } + + void reset_iteration_counters() const + { + iteration_counters.less_count[0] = 0; + iteration_counters.equal_count[0] = 0; + iteration_counters.greater_equal_count[0] = 0; + iteration_counters.nan_count[0] = 0; + } + + void cleanup(sycl::queue &queue) + { + counters.cleanup(queue); + iteration_counters.cleanup(queue); + + sycl::free(stop, queue); + sycl::free(target_found, queue); + sycl::free(left, queue); + + sycl::free(num_elems, queue); + sycl::free(pivot, queue); + } +}; + +template +struct PartitionState +{ + Counters iteration_counters; + + bool *stop; + bool *left; + + T *pivot; + + size_t n; + size_t *num_elems; + + PartitionState(State &state) + : iteration_counters(state.iteration_counters) + { + stop = state.stop; + left = state.left; + + num_elems = state.num_elems; + pivot = state.pivot; + + n = state.n; + } + + sycl::event init(sycl::queue &queue, const std::vector &deps) + { + sycl::event fill_e = + 
queue.fill(iteration_counters.less_count, n, 1, deps); + fill_e = queue.fill(iteration_counters.equal_count, 0, 1, + {fill_e}); + fill_e = queue.fill(iteration_counters.greater_equal_count, 0, + 1, {fill_e}); + fill_e = + queue.fill(iteration_counters.nan_count, 0, 1, {fill_e}); + + return fill_e; + } +}; + +template +class partition_one_pivot_kernel; + +template +void submit_partition_one_pivot(sycl::handler &cgh, + sycl::nd_range<1> work_sz, + T *in, + T *out, + PartitionState &state) +{ + auto loc_counters = + sycl::local_accessor(sycl::range<1>(4), cgh); + cgh.parallel_for>( + work_sz, [=](sycl::nd_item<1> item) { + if (state.stop[0]) + return; + + auto group = item.get_group(); + uint64_t items_per_group = group.get_local_range(0) * WorkPI; + uint64_t num_elems = state.num_elems[0]; + + if (group.get_group_id(0) * items_per_group >= num_elems) + return; + + T *_in = nullptr; + if (state.left[0]) { + _in = in; + } + else { + _in = in + state.n - num_elems; + } + + auto value = state.pivot[0]; + + auto sbg = item.get_sub_group(); + uint32_t sbg_size = sbg.get_max_local_range()[0]; + + uint64_t i_base = + (item.get_global_linear_id() - sbg.get_local_linear_id()) * + WorkPI; + + if (group.leader()) { + loc_counters[0] = 0; + loc_counters[1] = 0; + loc_counters[2] = 0; + } + + sycl::group_barrier(group); + + uint32_t less_count = 0; + uint32_t equal_count = 0; + uint32_t greater_equal_count = 0; + uint32_t nan_count = 0; + + T values[WorkPI]; + uint32_t actual_count = 0; + uint64_t local_i_base = i_base + sbg.get_local_linear_id(); + + for (uint32_t _i = 0; _i < WorkPI; ++_i) { + auto i = local_i_base + _i * sbg_size; + if (i < num_elems) { + values[_i] = _in[i]; + less_count += Less{}(values[_i], value); + equal_count += values[_i] == value; + nan_count += IsNan::isnan(values[_i]); + actual_count++; + } + } + + greater_equal_count = actual_count - less_count; + + auto sbg_less_equal = + sycl::reduce_over_group(sbg, less_count, sycl::plus<>()); + auto sbg_equal = + sycl::reduce_over_group(sbg, equal_count, sycl::plus<>()); + auto sbg_greater = sycl::reduce_over_group(sbg, greater_equal_count, + sycl::plus<>()); + + uint32_t local_less_offset = 0; + uint32_t local_gr_offset = 0; + if (sbg.leader()) { + sycl::atomic_ref + gr_less_eq(loc_counters[0]); + local_less_offset = gr_less_eq.fetch_add(sbg_less_equal); + + sycl::atomic_ref + gr_eq(loc_counters[1]); + gr_eq += sbg_equal; + + sycl::atomic_ref + gr_greater(loc_counters[2]); + local_gr_offset = gr_greater.fetch_add(sbg_greater); + } + + local_less_offset = + sycl::select_from_group(sbg, local_less_offset, 0); + local_gr_offset = sycl::select_from_group(sbg, local_gr_offset, 0); + + sycl::group_barrier(group); + + if (group.leader()) { + sycl::atomic_ref + glbl_less_eq(state.iteration_counters.less_count[0]); + auto global_less_eq_offset = + glbl_less_eq.fetch_add(loc_counters[0]); + + sycl::atomic_ref + glbl_eq(state.iteration_counters.equal_count[0]); + glbl_eq += loc_counters[1]; + + sycl::atomic_ref + glbl_greater( + state.iteration_counters.greater_equal_count[0]); + auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]); + + loc_counters[0] = global_less_eq_offset; + loc_counters[2] = global_gr_offset; + } + + sycl::group_barrier(group); + + auto sbg_less_offset = loc_counters[0] + local_less_offset; + auto sbg_gr_offset = + state.n - (loc_counters[2] + local_gr_offset + sbg_greater); + + uint32_t le_item_offset = 0; + uint32_t gr_item_offset = 0; + + for (uint32_t _i = 0; _i < WorkPI; ++_i) { + uint32_t less = values[_i] < 
value; + auto le_pos = + sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); + auto ge_pos = sbg.get_local_linear_id() - le_pos; + + auto total_le = + sycl::reduce_over_group(sbg, less, sycl::plus<>()); + auto total_gr = sbg_size - total_le; + + if (_i < actual_count) { + if (less) { + out[sbg_less_offset + le_item_offset + le_pos] = + values[_i]; + } + else { + out[sbg_gr_offset + gr_item_offset + ge_pos] = + values[_i]; + } + le_item_offset += total_le; + gr_item_offset += total_gr; + } + } + }); +} + +} // namespace statistics::partitioning diff --git a/dpnp/backend/extensions/statistics/statistics_py.cpp b/dpnp/backend/extensions/statistics/statistics_py.cpp index 6636d3f7d531..757ec85c6222 100644 --- a/dpnp/backend/extensions/statistics/statistics_py.cpp +++ b/dpnp/backend/extensions/statistics/statistics_py.cpp @@ -32,12 +32,14 @@ #include "bincount.hpp" #include "histogram.hpp" #include "histogramdd.hpp" +#include "kth_element1d.hpp" #include "sliding_dot_product1d.hpp" PYBIND11_MODULE(_statistics_impl, m) { statistics::histogram::populate_bincount(m); statistics::histogram::populate_histogram(m); + statistics::partitioning::populate_kth_element1d(m); statistics::sliding_window1d::populate_sliding_dot_product1d(m); statistics::histogram::populate_histogramdd(m); } From a8a93da85de93f2cba2b79e2e8a89f0de989acfe Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Wed, 23 Apr 2025 16:33:39 +0200 Subject: [PATCH 02/13] Call implementation from python --- .../extensions/statistics/kth_element1d.cpp | 156 ++++++++++++------ .../extensions/statistics/kth_element1d.hpp | 5 +- .../extensions/statistics/partitioning.hpp | 1 + dpnp/dpnp_utils/dpnp_utils_statistics.py | 41 +++++ 4 files changed, 153 insertions(+), 50 deletions(-) diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index 9e2a2e235886..7a07f568c9f0 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -40,7 +40,8 @@ #include "kth_element1d.hpp" #include "partitioning.hpp" -// #include +#include +#include namespace sycl_exp = sycl::ext::oneapi::experimental; namespace dpctl_td_ns = dpctl::tensor::type_dispatch; @@ -67,6 +68,7 @@ struct KthElementF State &state, uint64_t items_to_sort, uint64_t limit, + bool ret, const std::vector &deps) { auto e = queue.submit([&](sycl::handler &cgh) { @@ -84,6 +86,12 @@ struct KthElementF auto scratch = sycl::local_accessor( sycl::range<1>(temp_memory_size), cgh); + // std::cout << "temp_memory_size: " << temp_memory_size + // << " items_to_sort: " << items_to_sort + // << " limit: " << limit + // << " group_size: " << group_size << "\n"; + + // auto str = sycl::stream(8192, 1024, cgh); cgh.parallel_for>( work_sz, [=](sycl::nd_item<1> item) { auto group = item.get_group(); @@ -129,7 +137,6 @@ struct KthElementF target_found = true; } } - state.reset_iteration_counters(); } @@ -142,10 +149,15 @@ struct KthElementF return; } + // if (group.leader()) { + // str << "num_elems: " << num_elems << "\n"; + // } + if (num_elems <= limit) { auto gh = sycl_exp::group_with_scratchpad( group, sycl::span{&scratch[0], temp_memory_size}); - sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems]); + if (num_elems > 0) + sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems]); if (group.leader()) { uint64_t offset = state.counters.less_count[0]; @@ -154,9 +166,18 @@ struct KthElementF state.counters.less_count[0] - num_elems; } - uint64_t idx = target - offset; - 
state.values[0] = _in[idx]; - state.values[1] = _in[idx + 1]; + int64_t idx = target - offset; + + // if (idx + 1 > (in + state.n - _in) || idx < 0) + // { + // str << "buffer access out of bounds idx = " + // << idx << " size " << (in + state.n - _in) << "\n"; + // } + // else + { + state.values[0] = _in[idx]; + state.values[1] = _in[idx + 1]; + } state.stop[0] = true; state.target_found[0] = true; @@ -165,6 +186,9 @@ struct KthElementF return; } + // if (ret) + // return; + uint64_t step = num_elems / items_to_sort; for (uint32_t i = llid; i < items_to_sort; i += local_size) { @@ -184,30 +208,30 @@ struct KthElementF T new_pivot = loc_items[items_to_sort / 2]; - if (new_pivot != state.pivot[0]) { + // if (new_pivot != state.pivot[0]) { if (group.leader()) { state.pivot[0] = new_pivot; state.num_elems[0] = num_elems; } return; - } - - auto start = llid + items_to_sort / 2 + 1; - uint32_t index = start; - for (uint32_t i = start; i < items_to_sort; i += local_size) - { - if (loc_items[i] != new_pivot) { - index = i; - break; - } - } - - index = sycl::reduce_over_group(group, index, - sycl::minimum<>()); - if (group.leader()) { - state.pivot[0] = loc_items[index]; - state.num_elems[0] = num_elems; - } + // } + + // auto start = llid + items_to_sort / 2 + 1; + // uint32_t index = start; + // for (uint32_t i = start; i < items_to_sort; i += local_size) + // { + // if (loc_items[i] != new_pivot) { + // index = i; + // break; + // } + // } + + // index = sycl::reduce_over_group(group, index, + // sycl::minimum<>()); + // if (group.leader()) { + // state.pivot[0] = loc_items[index]; + // state.num_elems[0] = num_elems; + // } }); }); @@ -225,7 +249,7 @@ struct KthElementF auto e = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(deps); - constexpr uint32_t WorkPI = 4; // empirically found number + constexpr uint32_t WorkPI = 1; // empirically found number auto work_range = make_ndrange(state.n, group_size, WorkPI); submit_partition_one_pivot(cgh, work_range, in, out, @@ -243,34 +267,42 @@ struct KthElementF PartitionState &pstate, const std::vector &depends) { - uint32_t items_to_sort = 128; - uint32_t limit = 4 * items_to_sort; + uint32_t items_to_sort = 127; + uint32_t limit = 4 * (items_to_sort + 1); uint32_t iterations = - std::ceil(std::log(double(state.n) / limit) / std::log(2)); + std::ceil(-std::log(double(state.n) / limit) / std::log(0.536)) + 1; + // Ensure iterations are odd so the final result is always stored in 'partitioned' + iterations += 1 - iterations % 2; auto temp_buff = dpctl_utils::smart_malloc(state.n, exec_q, sycl::usm::alloc::device); + std::cout << "Iteration " << 0 << std::endl; auto prev = run_pick_pivot(exec_q, const_cast(in), partitioned, k, - state, items_to_sort, limit, depends); + state, items_to_sort, limit, false, depends); prev = run_partition(exec_q, const_cast(in), partitioned, pstate, {prev}); + // prev.wait(); T *_in = partitioned; T *_out = temp_buff.get(); for (uint32_t i = 0; i < iterations - 1; ++i) { - prev = run_pick_pivot(exec_q, _in, _out, k, state, limit, - items_to_sort, {prev}); + std::cout << "Iteration " << i + 1 << std::endl; + prev = run_pick_pivot(exec_q, _in, _out, k, state, + items_to_sort, limit, true, {prev}); prev = run_partition(exec_q, _in, _out, pstate, {prev}); std::swap(_in, _out); + // prev.wait(); + // if (i % 5 == 0) + // prev.wait(); } - prev = run_pick_pivot(exec_q, _in, _out, k, state, limit, items_to_sort, - {prev}); + prev = run_pick_pivot(exec_q, _in, _out, k, state, items_to_sort, limit, + true, {prev}); return 
prev; } - static std::tuple + static KthElement1d::RetT impl(sycl::queue &exec_queue, const void *v_ain, void *v_partitioned, @@ -278,12 +310,14 @@ struct KthElementF const size_t k, const std::vector &depends) { + auto start = std::chrono::high_resolution_clock::now(); const T *ain = static_cast(v_ain); T *partitioned = static_cast(v_partitioned); State state(exec_queue, a_size, partitioned); PartitionState pstate(state); + exec_queue.wait(); auto init_e = state.init(exec_queue, depends); init_e = pstate.init(exec_queue, {init_e}); @@ -295,6 +329,7 @@ struct KthElementF uint64_t less_count = 0; uint64_t greater_equal_count = 0; uint64_t num_elems = 0; + uint64_t nan_count = 0; auto copy_evt = exec_queue.copy(state.target_found, &found, 1, evt); copy_evt = exec_queue.copy(state.left, &left, 1, copy_evt); copy_evt = exec_queue.copy(state.counters.less_count, &less_count, 1, @@ -302,27 +337,52 @@ struct KthElementF copy_evt = exec_queue.copy(state.counters.greater_equal_count, &greater_equal_count, 1, copy_evt); copy_evt = exec_queue.copy(state.num_elems, &num_elems, 1, copy_evt); - - copy_evt.wait(); + copy_evt = exec_queue.copy(state.counters.nan_count, &nan_count, 1, copy_evt); uint64_t buff_offset = 0; uint64_t elems_offset = less_count; - if (!found) { - if (left) { - elems_offset = less_count - num_elems; + + try + { + copy_evt.wait(); + + if (!found) { + if (left) { + elems_offset = less_count - num_elems; + } + else { + buff_offset = a_size - num_elems; + } } else { - buff_offset = a_size - num_elems; + num_elems = 2; + elems_offset = k; } + + state.cleanup(exec_queue); + auto end = std::chrono::high_resolution_clock::now(); + + auto duration = + std::chrono::duration_cast(end - start) + .count(); + + std::cout << "KthElement1d took " << duration << " microseconds" + << std::endl; + + std::cout << "Found " << found << " left " << left + << " less_count " << less_count + << " greater_equal_count " << greater_equal_count + << " num_elems " << num_elems + << " nan_count " << nan_count + << std::endl; + /* code */ } - else { - num_elems = 2; - elems_offset = k; + catch (sycl::exception const &e) + { + std::cout << e.what() << std::endl; } - state.cleanup(exec_queue); - - return {found, buff_offset, elems_offset, num_elems}; + return {found, buff_offset, elems_offset, num_elems, nan_count}; } }; @@ -335,7 +395,7 @@ KthElement1d::KthElement1d() : dispatch_table("a") dispatch_table.populate_dispatch_table(); } -std::tuple +KthElement1d::RetT KthElement1d::call(const dpctl::tensor::usm_ndarray &a, dpctl::tensor::usm_ndarray &partitioned, const size_t k, diff --git a/dpnp/backend/extensions/statistics/kth_element1d.hpp b/dpnp/backend/extensions/statistics/kth_element1d.hpp index 06219823423b..0507206e439b 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.hpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.hpp @@ -33,7 +33,8 @@ namespace statistics::partitioning { struct KthElement1d { - using FnT = std::tuple (*)( + using RetT = std::tuple; + using FnT = RetT (*)( sycl::queue &, const void *, void *, @@ -45,7 +46,7 @@ struct KthElement1d KthElement1d(); - std::tuple + RetT call(const dpctl::tensor::usm_ndarray &a, dpctl::tensor::usm_ndarray &partitioned, uint64_t k, diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp index b249272a9c74..748a4a16f01c 100644 --- a/dpnp/backend/extensions/statistics/partitioning.hpp +++ b/dpnp/backend/extensions/statistics/partitioning.hpp @@ -205,6 +205,7 @@ void 
submit_partition_one_pivot(sycl::handler &cgh, { auto loc_counters = sycl::local_accessor(sycl::range<1>(4), cgh); + // sycl::stream str(8192, 1024, cgh); cgh.parallel_for>( work_sz, [=](sycl::nd_item<1> item) { if (state.stop[0]) diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py index 108fda7286fc..47fab796d7d3 100644 --- a/dpnp/dpnp_utils/dpnp_utils_statistics.py +++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py @@ -29,6 +29,9 @@ import dpctl.tensor as dpt from dpctl.tensor._numpy_helper import normalize_axis_tuple from dpctl.utils import ExecutionPlacementError +import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext + +import dpctl.utils as dpu import dpnp from dpnp.dpnp_array import dpnp_array @@ -190,6 +193,41 @@ def dpnp_cov( c = dpnp.dot(x, x_t.conj()) / fact return c.squeeze() +def native_median(a): + + partitioned = dpnp.empty_like(a) + a_usm = dpnp.get_usm_ndarray(a) + partitioned_usm = dpnp.get_usm_ndarray(partitioned) + + _manager = dpu.SequentialOrderManager[a.sycl_queue] + + result = dpnp.empty_like(a, shape = 1) + k = a.shape[0] // 2 + + found, buff_offset, elems_offset, num_elems, nan_count = statistics_ext.kth_element( + a_usm, + partitioned_usm, + k, + depends=_manager.submitted_events, + ) + + if found: + if a.shape[0] % 2 == 0: + # even number of elements + result[0] = (partitioned[0] + partitioned[1]) / 2 + else: + result[0] = partitioned[0] + else: + partitioned[buff_offset:buff_offset + num_elems].sort() + kth_idx = buff_offset + k - elems_offset + if a.shape[0] % 2 == 0: + # even number of elements + result[0] = (partitioned[kth_idx] + partitioned[kth_idx + 1]) / 2 + else: + result[0] = partitioned[kth_idx] + + return result + def dpnp_median( a, @@ -223,6 +261,9 @@ def dpnp_median( ) axis = -1 + if not isinstance(a.dtype, dpnp.complexfloating) and not ignore_nan and a_ndim == 1: + return native_median(a) + if overwrite_input: if isinstance(a, dpt.usm_ndarray): # dpnp.ndarray.sort only works with dpnp_array From a518e0c7bba60643a652a08765cb8d13e8f7790c Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Wed, 23 Apr 2025 17:05:42 +0200 Subject: [PATCH 03/13] Add sync --- dpnp/backend/extensions/statistics/kth_element1d.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index 7a07f568c9f0..4e5817ca97cc 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -296,6 +296,7 @@ struct KthElementF // if (i % 5 == 0) // prev.wait(); } + prev.wait(); prev = run_pick_pivot(exec_q, _in, _out, k, state, items_to_sort, limit, true, {prev}); @@ -359,7 +360,6 @@ struct KthElementF elems_offset = k; } - state.cleanup(exec_queue); auto end = std::chrono::high_resolution_clock::now(); auto duration = @@ -382,6 +382,7 @@ struct KthElementF std::cout << e.what() << std::endl; } + state.cleanup(exec_queue); return {found, buff_offset, elems_offset, num_elems, nan_count}; } }; From 9119ae9f8c9a8bed45c0a84c05a94c058e28db7d Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Fri, 25 Apr 2025 15:48:43 +0200 Subject: [PATCH 04/13] Crash due to buffer deletion and add support for complex type --- .../extensions/statistics/kth_element1d.cpp | 138 +++++++----------- .../extensions/statistics/partitioning.hpp | 18 ++- dpnp/dpnp_utils/dpnp_utils_statistics.py | 2 +- 3 files changed, 62 insertions(+), 96 deletions(-) diff 
--git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index 4e5817ca97cc..fef92b4f71f6 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -68,7 +68,6 @@ struct KthElementF State &state, uint64_t items_to_sort, uint64_t limit, - bool ret, const std::vector &deps) { auto e = queue.submit([&](sycl::handler &cgh) { @@ -149,15 +148,11 @@ struct KthElementF return; } - // if (group.leader()) { - // str << "num_elems: " << num_elems << "\n"; - // } - if (num_elems <= limit) { auto gh = sycl_exp::group_with_scratchpad( group, sycl::span{&scratch[0], temp_memory_size}); if (num_elems > 0) - sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems]); + sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems], Less{}); if (group.leader()) { uint64_t offset = state.counters.less_count[0]; @@ -168,16 +163,8 @@ struct KthElementF int64_t idx = target - offset; - // if (idx + 1 > (in + state.n - _in) || idx < 0) - // { - // str << "buffer access out of bounds idx = " - // << idx << " size " << (in + state.n - _in) << "\n"; - // } - // else - { - state.values[0] = _in[idx]; - state.values[1] = _in[idx + 1]; - } + state.values[0] = _in[idx]; + state.values[1] = _in[idx + 1]; state.stop[0] = true; state.target_found[0] = true; @@ -186,9 +173,6 @@ struct KthElementF return; } - // if (ret) - // return; - uint64_t step = num_elems / items_to_sort; for (uint32_t i = llid; i < items_to_sort; i += local_size) { @@ -204,34 +188,34 @@ struct KthElementF auto gh = sycl_exp::group_with_scratchpad( group, sycl::span{&scratch[0], temp_memory_size}); sycl_exp::joint_sort(gh, &loc_items[0], - &loc_items[0] + items_to_sort); + &loc_items[0] + items_to_sort, Less{}); T new_pivot = loc_items[items_to_sort / 2]; - // if (new_pivot != state.pivot[0]) { + if (new_pivot != state.pivot[0]) { if (group.leader()) { state.pivot[0] = new_pivot; state.num_elems[0] = num_elems; } return; - // } - - // auto start = llid + items_to_sort / 2 + 1; - // uint32_t index = start; - // for (uint32_t i = start; i < items_to_sort; i += local_size) - // { - // if (loc_items[i] != new_pivot) { - // index = i; - // break; - // } - // } - - // index = sycl::reduce_over_group(group, index, - // sycl::minimum<>()); - // if (group.leader()) { - // state.pivot[0] = loc_items[index]; - // state.num_elems[0] = num_elems; - // } + } + + auto start = llid + items_to_sort / 2 + 1; + uint32_t index = start; + for (uint32_t i = start; i < items_to_sort; i += local_size) + { + if (loc_items[i] != new_pivot) { + index = i; + break; + } + } + + index = sycl::reduce_over_group(group, index, + sycl::minimum<>()); + if (group.leader()) { + state.pivot[0] = loc_items[index]; + state.num_elems[0] = num_elems; + } }); }); @@ -262,6 +246,7 @@ struct KthElementF static sycl::event run_kth_element(sycl::queue &exec_q, const T *in, T *partitioned, + T *temp_buff, const size_t k, State &state, PartitionState &pstate, @@ -274,31 +259,21 @@ struct KthElementF // Ensure iterations are odd so the final result is always stored in 'partitioned' iterations += 1 - iterations % 2; - auto temp_buff = dpctl_utils::smart_malloc(state.n, exec_q, - sycl::usm::alloc::device); - - std::cout << "Iteration " << 0 << std::endl; auto prev = run_pick_pivot(exec_q, const_cast(in), partitioned, k, - state, items_to_sort, limit, false, depends); + state, items_to_sort, limit, depends); prev = run_partition(exec_q, const_cast(in), partitioned, pstate, {prev}); - // 
prev.wait(); T *_in = partitioned; - T *_out = temp_buff.get(); + T *_out = temp_buff; for (uint32_t i = 0; i < iterations - 1; ++i) { - std::cout << "Iteration " << i + 1 << std::endl; prev = run_pick_pivot(exec_q, _in, _out, k, state, - items_to_sort, limit, true, {prev}); + items_to_sort, limit, {prev}); prev = run_partition(exec_q, _in, _out, pstate, {prev}); std::swap(_in, _out); - // prev.wait(); - // if (i % 5 == 0) - // prev.wait(); } - prev.wait(); prev = run_pick_pivot(exec_q, _in, _out, k, state, items_to_sort, limit, - true, {prev}); + {prev}); return prev; } @@ -322,7 +297,9 @@ struct KthElementF auto init_e = state.init(exec_queue, depends); init_e = pstate.init(exec_queue, {init_e}); - auto evt = run_kth_element(exec_queue, ain, partitioned, k, state, + auto temp_buff = dpctl_utils::smart_malloc(state.n, exec_queue, + sycl::usm::alloc::device); + auto evt = run_kth_element(exec_queue, ain, partitioned, temp_buff.get(), k, state, pstate, {init_e}); bool found = false; @@ -343,52 +320,37 @@ struct KthElementF uint64_t buff_offset = 0; uint64_t elems_offset = less_count; - try - { - copy_evt.wait(); - - if (!found) { - if (left) { - elems_offset = less_count - num_elems; - } - else { - buff_offset = a_size - num_elems; - } + copy_evt.wait(); + + if (!found) { + if (left) { + elems_offset = less_count - num_elems; } else { - num_elems = 2; - elems_offset = k; + buff_offset = a_size - num_elems; } - - auto end = std::chrono::high_resolution_clock::now(); - - auto duration = - std::chrono::duration_cast(end - start) - .count(); - - std::cout << "KthElement1d took " << duration << " microseconds" - << std::endl; - - std::cout << "Found " << found << " left " << left - << " less_count " << less_count - << " greater_equal_count " << greater_equal_count - << " num_elems " << num_elems - << " nan_count " << nan_count - << std::endl; - /* code */ } - catch (sycl::exception const &e) - { - std::cout << e.what() << std::endl; + else { + num_elems = 2; + elems_offset = k; } state.cleanup(exec_queue); + + auto end = std::chrono::high_resolution_clock::now(); + + auto duration = + std::chrono::duration_cast(end - start) + .count(); + + std::cout << "KthElement1d took " << duration << " microseconds" + << std::endl; return {found, buff_offset, elems_offset, num_elems, nan_count}; } }; using SupportedTypes = - std::tuple; + std::tuple, std::complex>; } // namespace KthElement1d::KthElement1d() : dispatch_table("a") diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp index 748a4a16f01c..8bdc2c836e98 100644 --- a/dpnp/backend/extensions/statistics/partitioning.hpp +++ b/dpnp/backend/extensions/statistics/partitioning.hpp @@ -256,14 +256,15 @@ void submit_partition_one_pivot(sycl::handler &cgh, auto i = local_i_base + _i * sbg_size; if (i < num_elems) { values[_i] = _in[i]; - less_count += Less{}(values[_i], value); - equal_count += values[_i] == value; - nan_count += IsNan::isnan(values[_i]); + auto is_nan = IsNan::isnan(values[_i]); + less_count += (Less{}(values[_i], value) && !is_nan); + equal_count += (values[_i] == value && !is_nan); + nan_count += is_nan; actual_count++; } } - greater_equal_count = actual_count - less_count; + greater_equal_count = actual_count - less_count - nan_count; auto sbg_less_equal = sycl::reduce_over_group(sbg, less_count, sycl::plus<>()); @@ -329,21 +330,24 @@ void submit_partition_one_pivot(sycl::handler &cgh, uint32_t gr_item_offset = 0; for (uint32_t _i = 0; _i < WorkPI; ++_i) { - uint32_t less 
= values[_i] < value; + uint32_t is_nan = IsNan::isnan(values[_i]); + uint32_t less = (!is_nan && Less{}(values[_i], value)); auto le_pos = sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); auto ge_pos = sbg.get_local_linear_id() - le_pos; auto total_le = sycl::reduce_over_group(sbg, less, sycl::plus<>()); - auto total_gr = sbg_size - total_le; + auto total_nan = + sycl::reduce_over_group(sbg, is_nan, sycl::plus<>()); + auto total_gr = sbg_size - total_le - total_nan; if (_i < actual_count) { if (less) { out[sbg_less_offset + le_item_offset + le_pos] = values[_i]; } - else { + else if (!is_nan){ out[sbg_gr_offset + gr_item_offset + ge_pos] = values[_i]; } diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py index 47fab796d7d3..ceff1f9351d6 100644 --- a/dpnp/dpnp_utils/dpnp_utils_statistics.py +++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py @@ -261,7 +261,7 @@ def dpnp_median( ) axis = -1 - if not isinstance(a.dtype, dpnp.complexfloating) and not ignore_nan and a_ndim == 1: + if not ignore_nan and a_ndim == 1: return native_median(a) if overwrite_input: From db26d6b54086f053baa86dbf497132cdaae276ed Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Fri, 25 Apr 2025 15:54:20 +0200 Subject: [PATCH 05/13] pre-commit --- .../extensions/statistics/kth_element1d.cpp | 58 +++++++++++-------- .../extensions/statistics/kth_element1d.hpp | 22 ++++--- .../extensions/statistics/partitioning.hpp | 2 +- dpnp/dpnp_utils/dpnp_utils_statistics.py | 22 +++---- 4 files changed, 56 insertions(+), 48 deletions(-) diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index fef92b4f71f6..94fd3eb24e3d 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -40,8 +40,8 @@ #include "kth_element1d.hpp" #include "partitioning.hpp" -#include #include +#include namespace sycl_exp = sycl::ext::oneapi::experimental; namespace dpctl_td_ns = dpctl::tensor::type_dispatch; @@ -152,7 +152,8 @@ struct KthElementF auto gh = sycl_exp::group_with_scratchpad( group, sycl::span{&scratch[0], temp_memory_size}); if (num_elems > 0) - sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems], Less{}); + sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems], + Less{}); if (group.leader()) { uint64_t offset = state.counters.less_count[0]; @@ -188,7 +189,8 @@ struct KthElementF auto gh = sycl_exp::group_with_scratchpad( group, sycl::span{&scratch[0], temp_memory_size}); sycl_exp::joint_sort(gh, &loc_items[0], - &loc_items[0] + items_to_sort, Less{}); + &loc_items[0] + items_to_sort, + Less{}); T new_pivot = loc_items[items_to_sort / 2]; @@ -256,7 +258,8 @@ struct KthElementF uint32_t limit = 4 * (items_to_sort + 1); uint32_t iterations = std::ceil(-std::log(double(state.n) / limit) / std::log(0.536)) + 1; - // Ensure iterations are odd so the final result is always stored in 'partitioned' + // Ensure iterations are odd so the final result is always stored in + // 'partitioned' iterations += 1 - iterations % 2; auto prev = run_pick_pivot(exec_q, const_cast(in), partitioned, k, @@ -267,8 +270,8 @@ struct KthElementF T *_in = partitioned; T *_out = temp_buff; for (uint32_t i = 0; i < iterations - 1; ++i) { - prev = run_pick_pivot(exec_q, _in, _out, k, state, - items_to_sort, limit, {prev}); + prev = run_pick_pivot(exec_q, _in, _out, k, state, items_to_sort, + limit, {prev}); prev = run_partition(exec_q, _in, _out, pstate, {prev}); std::swap(_in, _out); } @@ 
-278,13 +281,12 @@ struct KthElementF return prev; } - static KthElement1d::RetT - impl(sycl::queue &exec_queue, - const void *v_ain, - void *v_partitioned, - const size_t a_size, - const size_t k, - const std::vector &depends) + static KthElement1d::RetT impl(sycl::queue &exec_queue, + const void *v_ain, + void *v_partitioned, + const size_t a_size, + const size_t k, + const std::vector &depends) { auto start = std::chrono::high_resolution_clock::now(); const T *ain = static_cast(v_ain); @@ -298,9 +300,9 @@ struct KthElementF init_e = pstate.init(exec_queue, {init_e}); auto temp_buff = dpctl_utils::smart_malloc(state.n, exec_queue, - sycl::usm::alloc::device); - auto evt = run_kth_element(exec_queue, ain, partitioned, temp_buff.get(), k, state, - pstate, {init_e}); + sycl::usm::alloc::device); + auto evt = run_kth_element(exec_queue, ain, partitioned, + temp_buff.get(), k, state, pstate, {init_e}); bool found = false; bool left = false; @@ -315,7 +317,8 @@ struct KthElementF copy_evt = exec_queue.copy(state.counters.greater_equal_count, &greater_equal_count, 1, copy_evt); copy_evt = exec_queue.copy(state.num_elems, &num_elems, 1, copy_evt); - copy_evt = exec_queue.copy(state.counters.nan_count, &nan_count, 1, copy_evt); + copy_evt = + exec_queue.copy(state.counters.nan_count, &nan_count, 1, copy_evt); uint64_t buff_offset = 0; uint64_t elems_offset = less_count; @@ -344,13 +347,19 @@ struct KthElementF .count(); std::cout << "KthElement1d took " << duration << " microseconds" - << std::endl; + << std::endl; return {found, buff_offset, elems_offset, num_elems, nan_count}; } }; -using SupportedTypes = - std::tuple, std::complex>; +using SupportedTypes = std::tuple, + std::complex>; } // namespace KthElement1d::KthElement1d() : dispatch_table("a") @@ -358,11 +367,10 @@ KthElement1d::KthElement1d() : dispatch_table("a") dispatch_table.populate_dispatch_table(); } -KthElement1d::RetT - KthElement1d::call(const dpctl::tensor::usm_ndarray &a, - dpctl::tensor::usm_ndarray &partitioned, - const size_t k, - const std::vector &depends) +KthElement1d::RetT KthElement1d::call(const dpctl::tensor::usm_ndarray &a, + dpctl::tensor::usm_ndarray &partitioned, + const size_t k, + const std::vector &depends) { // validate(a, partitioned, k); diff --git a/dpnp/backend/extensions/statistics/kth_element1d.hpp b/dpnp/backend/extensions/statistics/kth_element1d.hpp index 0507206e439b..b181028bb1e9 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.hpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.hpp @@ -34,23 +34,21 @@ namespace statistics::partitioning struct KthElement1d { using RetT = std::tuple; - using FnT = RetT (*)( - sycl::queue &, - const void *, - void *, - const size_t, - const size_t, - const std::vector &); + using FnT = RetT (*)(sycl::queue &, + const void *, + void *, + const size_t, + const size_t, + const std::vector &); ext::common::DispatchTable dispatch_table; KthElement1d(); - RetT - call(const dpctl::tensor::usm_ndarray &a, - dpctl::tensor::usm_ndarray &partitioned, - uint64_t k, - const std::vector &depends); + RetT call(const dpctl::tensor::usm_ndarray &a, + dpctl::tensor::usm_ndarray &partitioned, + uint64_t k, + const std::vector &depends); }; void populate_kth_element1d(py::module_ m); diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp index 8bdc2c836e98..e64428ea9386 100644 --- a/dpnp/backend/extensions/statistics/partitioning.hpp +++ b/dpnp/backend/extensions/statistics/partitioning.hpp @@ -347,7 
+347,7 @@ void submit_partition_one_pivot(sycl::handler &cgh, out[sbg_less_offset + le_item_offset + le_pos] = values[_i]; } - else if (!is_nan){ + else if (!is_nan) { out[sbg_gr_offset + gr_item_offset + ge_pos] = values[_i]; } diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py index ceff1f9351d6..8f4571a6f998 100644 --- a/dpnp/dpnp_utils/dpnp_utils_statistics.py +++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py @@ -27,13 +27,12 @@ import dpctl import dpctl.tensor as dpt +import dpctl.utils as dpu from dpctl.tensor._numpy_helper import normalize_axis_tuple from dpctl.utils import ExecutionPlacementError -import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext - -import dpctl.utils as dpu import dpnp +import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext from dpnp.dpnp_array import dpnp_array __all__ = ["dpnp_cov", "dpnp_median"] @@ -193,6 +192,7 @@ def dpnp_cov( c = dpnp.dot(x, x_t.conj()) / fact return c.squeeze() + def native_median(a): partitioned = dpnp.empty_like(a) @@ -201,14 +201,16 @@ def native_median(a): _manager = dpu.SequentialOrderManager[a.sycl_queue] - result = dpnp.empty_like(a, shape = 1) + result = dpnp.empty_like(a, shape=1) k = a.shape[0] // 2 - found, buff_offset, elems_offset, num_elems, nan_count = statistics_ext.kth_element( - a_usm, - partitioned_usm, - k, - depends=_manager.submitted_events, + found, buff_offset, elems_offset, num_elems, nan_count = ( + statistics_ext.kth_element( + a_usm, + partitioned_usm, + k, + depends=_manager.submitted_events, + ) ) if found: @@ -218,7 +220,7 @@ def native_median(a): else: result[0] = partitioned[0] else: - partitioned[buff_offset:buff_offset + num_elems].sort() + partitioned[buff_offset : buff_offset + num_elems].sort() kth_idx = buff_offset + k - elems_offset if a.shape[0] % 2 == 0: # even number of elements From 071f56a0ecf00897c4cf79c67fdd20a054275dc6 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Thu, 15 May 2025 15:04:43 +0200 Subject: [PATCH 06/13] small refactoring --- .../extensions/statistics/kth_element1d.cpp | 14 +- .../extensions/statistics/partitioning.hpp | 299 +++++++++--------- 2 files changed, 160 insertions(+), 153 deletions(-) diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index 94fd3eb24e3d..1f8a81490591 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -232,17 +232,9 @@ struct KthElementF { uint32_t group_size = 128; - auto e = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(deps); - - constexpr uint32_t WorkPI = 1; // empirically found number - - auto work_range = make_ndrange(state.n, group_size, WorkPI); - submit_partition_one_pivot(cgh, work_range, in, out, - state); - }); - - return e; + constexpr uint32_t WorkPI = 4; + return run_partition_one_pivot_cpu( + exec_q, in, out, state, deps, group_size); } static sycl::event run_kth_element(sycl::queue &exec_q, diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp index e64428ea9386..91c0c8dfec8e 100644 --- a/dpnp/backend/extensions/statistics/partitioning.hpp +++ b/dpnp/backend/extensions/statistics/partitioning.hpp @@ -193,169 +193,184 @@ struct PartitionState } }; -template -class partition_one_pivot_kernel; +template +struct partition_one_pivot_kernel_cpu; template -void submit_partition_one_pivot(sycl::handler &cgh, - 
sycl::nd_range<1> work_sz, - T *in, - T *out, - PartitionState &state) +auto partition_one_pivot_func_cpu(sycl::handler &cgh, T *in, T *out, PartitionState &state) { auto loc_counters = sycl::local_accessor(sycl::range<1>(4), cgh); - // sycl::stream str(8192, 1024, cgh); - cgh.parallel_for>( - work_sz, [=](sycl::nd_item<1> item) { - if (state.stop[0]) - return; - - auto group = item.get_group(); - uint64_t items_per_group = group.get_local_range(0) * WorkPI; - uint64_t num_elems = state.num_elems[0]; - - if (group.get_group_id(0) * items_per_group >= num_elems) - return; - - T *_in = nullptr; - if (state.left[0]) { - _in = in; - } - else { - _in = in + state.n - num_elems; - } - auto value = state.pivot[0]; + return [=](sycl::nd_item<1> item) { + if (state.stop[0]) + return; - auto sbg = item.get_sub_group(); - uint32_t sbg_size = sbg.get_max_local_range()[0]; + auto group = item.get_group(); + uint64_t items_per_group = group.get_local_range(0) * WorkPI; + uint64_t num_elems = state.num_elems[0]; - uint64_t i_base = - (item.get_global_linear_id() - sbg.get_local_linear_id()) * - WorkPI; + if (group.get_group_id(0) * items_per_group >= num_elems) + return; - if (group.leader()) { - loc_counters[0] = 0; - loc_counters[1] = 0; - loc_counters[2] = 0; - } + T *_in = nullptr; + if (state.left[0]) { + _in = in; + } + else { + _in = in + state.n - num_elems; + } - sycl::group_barrier(group); - - uint32_t less_count = 0; - uint32_t equal_count = 0; - uint32_t greater_equal_count = 0; - uint32_t nan_count = 0; - - T values[WorkPI]; - uint32_t actual_count = 0; - uint64_t local_i_base = i_base + sbg.get_local_linear_id(); - - for (uint32_t _i = 0; _i < WorkPI; ++_i) { - auto i = local_i_base + _i * sbg_size; - if (i < num_elems) { - values[_i] = _in[i]; - auto is_nan = IsNan::isnan(values[_i]); - less_count += (Less{}(values[_i], value) && !is_nan); - equal_count += (values[_i] == value && !is_nan); - nan_count += is_nan; - actual_count++; - } - } + auto value = state.pivot[0]; + + auto sbg = item.get_sub_group(); + uint32_t sbg_size = sbg.get_max_local_range()[0]; + + uint64_t i_base = + (item.get_global_linear_id() - sbg.get_local_linear_id()) * + WorkPI; + + if (group.leader()) { + loc_counters[0] = 0; + loc_counters[1] = 0; + loc_counters[2] = 0; + } - greater_equal_count = actual_count - less_count - nan_count; - - auto sbg_less_equal = - sycl::reduce_over_group(sbg, less_count, sycl::plus<>()); - auto sbg_equal = - sycl::reduce_over_group(sbg, equal_count, sycl::plus<>()); - auto sbg_greater = sycl::reduce_over_group(sbg, greater_equal_count, - sycl::plus<>()); - - uint32_t local_less_offset = 0; - uint32_t local_gr_offset = 0; - if (sbg.leader()) { - sycl::atomic_ref - gr_less_eq(loc_counters[0]); - local_less_offset = gr_less_eq.fetch_add(sbg_less_equal); - - sycl::atomic_ref - gr_eq(loc_counters[1]); - gr_eq += sbg_equal; - - sycl::atomic_ref - gr_greater(loc_counters[2]); - local_gr_offset = gr_greater.fetch_add(sbg_greater); + sycl::group_barrier(group); + + uint32_t less_count = 0; + uint32_t equal_count = 0; + uint32_t greater_equal_count = 0; + uint32_t nan_count = 0; + + T values[WorkPI]; + uint32_t actual_count = 0; + uint64_t local_i_base = i_base + sbg.get_local_linear_id(); + + for (uint32_t _i = 0; _i < WorkPI; ++_i) { + auto i = local_i_base + _i * sbg_size; + if (i < num_elems) { + values[_i] = _in[i]; + auto is_nan = IsNan::isnan(values[_i]); + less_count += (Less{}(values[_i], value) && !is_nan); + equal_count += (values[_i] == value && !is_nan); + nan_count += is_nan; + 
actual_count++; } + } - local_less_offset = - sycl::select_from_group(sbg, local_less_offset, 0); - local_gr_offset = sycl::select_from_group(sbg, local_gr_offset, 0); + greater_equal_count = actual_count - less_count - nan_count; + + auto sbg_less_equal = + sycl::reduce_over_group(sbg, less_count, sycl::plus<>()); + auto sbg_equal = + sycl::reduce_over_group(sbg, equal_count, sycl::plus<>()); + auto sbg_greater = sycl::reduce_over_group(sbg, greater_equal_count, + sycl::plus<>()); + + uint32_t local_less_offset = 0; + uint32_t local_gr_offset = 0; + if (sbg.leader()) { + sycl::atomic_ref + gr_less_eq(loc_counters[0]); + local_less_offset = gr_less_eq.fetch_add(sbg_less_equal); + + sycl::atomic_ref + gr_eq(loc_counters[1]); + gr_eq += sbg_equal; + + sycl::atomic_ref + gr_greater(loc_counters[2]); + local_gr_offset = gr_greater.fetch_add(sbg_greater); + } - sycl::group_barrier(group); + local_less_offset = + sycl::select_from_group(sbg, local_less_offset, 0); + local_gr_offset = sycl::select_from_group(sbg, local_gr_offset, 0); - if (group.leader()) { - sycl::atomic_ref - glbl_less_eq(state.iteration_counters.less_count[0]); - auto global_less_eq_offset = - glbl_less_eq.fetch_add(loc_counters[0]); + sycl::group_barrier(group); - sycl::atomic_ref - glbl_eq(state.iteration_counters.equal_count[0]); - glbl_eq += loc_counters[1]; + if (group.leader()) { + sycl::atomic_ref + glbl_less_eq(state.iteration_counters.less_count[0]); + auto global_less_eq_offset = + glbl_less_eq.fetch_add(loc_counters[0]); - sycl::atomic_ref - glbl_greater( - state.iteration_counters.greater_equal_count[0]); - auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]); + sycl::atomic_ref + glbl_eq(state.iteration_counters.equal_count[0]); + glbl_eq += loc_counters[1]; - loc_counters[0] = global_less_eq_offset; - loc_counters[2] = global_gr_offset; - } + sycl::atomic_ref + glbl_greater( + state.iteration_counters.greater_equal_count[0]); + auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]); - sycl::group_barrier(group); - - auto sbg_less_offset = loc_counters[0] + local_less_offset; - auto sbg_gr_offset = - state.n - (loc_counters[2] + local_gr_offset + sbg_greater); - - uint32_t le_item_offset = 0; - uint32_t gr_item_offset = 0; - - for (uint32_t _i = 0; _i < WorkPI; ++_i) { - uint32_t is_nan = IsNan::isnan(values[_i]); - uint32_t less = (!is_nan && Less{}(values[_i], value)); - auto le_pos = - sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); - auto ge_pos = sbg.get_local_linear_id() - le_pos; - - auto total_le = - sycl::reduce_over_group(sbg, less, sycl::plus<>()); - auto total_nan = - sycl::reduce_over_group(sbg, is_nan, sycl::plus<>()); - auto total_gr = sbg_size - total_le - total_nan; - - if (_i < actual_count) { - if (less) { - out[sbg_less_offset + le_item_offset + le_pos] = - values[_i]; - } - else if (!is_nan) { - out[sbg_gr_offset + gr_item_offset + ge_pos] = - values[_i]; - } - le_item_offset += total_le; - gr_item_offset += total_gr; + loc_counters[0] = global_less_eq_offset; + loc_counters[2] = global_gr_offset; + } + + sycl::group_barrier(group); + + auto sbg_less_offset = loc_counters[0] + local_less_offset; + auto sbg_gr_offset = + state.n - (loc_counters[2] + local_gr_offset + sbg_greater); + + uint32_t le_item_offset = 0; + uint32_t gr_item_offset = 0; + + for (uint32_t _i = 0; _i < WorkPI; ++_i) { + uint32_t is_nan = IsNan::isnan(values[_i]); + uint32_t less = (!is_nan && Less{}(values[_i], value)); + auto le_pos = + sycl::exclusive_scan_over_group(sbg, less, 
sycl::plus<>()); + auto ge_pos = sbg.get_local_linear_id() - le_pos; + + auto total_le = + sycl::reduce_over_group(sbg, less, sycl::plus<>()); + auto total_nan = + sycl::reduce_over_group(sbg, is_nan, sycl::plus<>()); + auto total_gr = sbg_size - total_le - total_nan; + + if (_i < actual_count) { + if (less) { + out[sbg_less_offset + le_item_offset + le_pos] = + values[_i]; } + else if (!is_nan) { + out[sbg_gr_offset + gr_item_offset + ge_pos] = + values[_i]; + } + le_item_offset += total_le; + gr_item_offset += total_gr; } - }); + } + }; +} + +template +sycl::event run_partition_one_pivot_cpu(sycl::queue &exec_q, + T *in, + T *out, + PartitionState &state, + const std::vector &deps, + uint32_t group_size) +{ + auto e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + + auto work_range = make_ndrange(state.n, group_size, WorkPI); + + cgh.parallel_for>( + work_range, partition_one_pivot_func_cpu(cgh, in, out, state)); + }); + + return e; } } // namespace statistics::partitioning From 29d46c2af811564b4e96cc9c04c83be262f86e80 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Thu, 15 May 2025 16:19:45 +0200 Subject: [PATCH 07/13] adding gpu-optimized version --- .../extensions/statistics/partitioning.hpp | 198 ++------------- .../partitioning_one_pivot_kernel_cpu.hpp | 228 ++++++++++++++++++ .../partitioning_one_pivot_kernel_gpu.hpp | 214 ++++++++++++++++ 3 files changed, 466 insertions(+), 174 deletions(-) create mode 100644 dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_cpu.hpp create mode 100644 dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_gpu.hpp diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp index 91c0c8dfec8e..3d01990a2503 100644 --- a/dpnp/backend/extensions/statistics/partitioning.hpp +++ b/dpnp/backend/extensions/statistics/partitioning.hpp @@ -193,184 +193,34 @@ struct PartitionState } }; -template -struct partition_one_pivot_kernel_cpu; - -template -auto partition_one_pivot_func_cpu(sycl::handler &cgh, T *in, T *out, PartitionState &state) -{ - auto loc_counters = - sycl::local_accessor(sycl::range<1>(4), cgh); - - return [=](sycl::nd_item<1> item) { - if (state.stop[0]) - return; - - auto group = item.get_group(); - uint64_t items_per_group = group.get_local_range(0) * WorkPI; - uint64_t num_elems = state.num_elems[0]; - - if (group.get_group_id(0) * items_per_group >= num_elems) - return; - - T *_in = nullptr; - if (state.left[0]) { - _in = in; - } - else { - _in = in + state.n - num_elems; - } - - auto value = state.pivot[0]; - - auto sbg = item.get_sub_group(); - uint32_t sbg_size = sbg.get_max_local_range()[0]; - - uint64_t i_base = - (item.get_global_linear_id() - sbg.get_local_linear_id()) * - WorkPI; - - if (group.leader()) { - loc_counters[0] = 0; - loc_counters[1] = 0; - loc_counters[2] = 0; - } - - sycl::group_barrier(group); - - uint32_t less_count = 0; - uint32_t equal_count = 0; - uint32_t greater_equal_count = 0; - uint32_t nan_count = 0; - - T values[WorkPI]; - uint32_t actual_count = 0; - uint64_t local_i_base = i_base + sbg.get_local_linear_id(); - - for (uint32_t _i = 0; _i < WorkPI; ++_i) { - auto i = local_i_base + _i * sbg_size; - if (i < num_elems) { - values[_i] = _in[i]; - auto is_nan = IsNan::isnan(values[_i]); - less_count += (Less{}(values[_i], value) && !is_nan); - equal_count += (values[_i] == value && !is_nan); - nan_count += is_nan; - actual_count++; - } - } - - greater_equal_count = actual_count - less_count - 
nan_count; - - auto sbg_less_equal = - sycl::reduce_over_group(sbg, less_count, sycl::plus<>()); - auto sbg_equal = - sycl::reduce_over_group(sbg, equal_count, sycl::plus<>()); - auto sbg_greater = sycl::reduce_over_group(sbg, greater_equal_count, - sycl::plus<>()); - - uint32_t local_less_offset = 0; - uint32_t local_gr_offset = 0; - if (sbg.leader()) { - sycl::atomic_ref - gr_less_eq(loc_counters[0]); - local_less_offset = gr_less_eq.fetch_add(sbg_less_equal); - - sycl::atomic_ref - gr_eq(loc_counters[1]); - gr_eq += sbg_equal; - - sycl::atomic_ref - gr_greater(loc_counters[2]); - local_gr_offset = gr_greater.fetch_add(sbg_greater); - } - - local_less_offset = - sycl::select_from_group(sbg, local_less_offset, 0); - local_gr_offset = sycl::select_from_group(sbg, local_gr_offset, 0); - - sycl::group_barrier(group); - - if (group.leader()) { - sycl::atomic_ref - glbl_less_eq(state.iteration_counters.less_count[0]); - auto global_less_eq_offset = - glbl_less_eq.fetch_add(loc_counters[0]); - - sycl::atomic_ref - glbl_eq(state.iteration_counters.equal_count[0]); - glbl_eq += loc_counters[1]; - - sycl::atomic_ref - glbl_greater( - state.iteration_counters.greater_equal_count[0]); - auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]); - - loc_counters[0] = global_less_eq_offset; - loc_counters[2] = global_gr_offset; - } +} // namespace statistics::partitioning - sycl::group_barrier(group); - - auto sbg_less_offset = loc_counters[0] + local_less_offset; - auto sbg_gr_offset = - state.n - (loc_counters[2] + local_gr_offset + sbg_greater); - - uint32_t le_item_offset = 0; - uint32_t gr_item_offset = 0; - - for (uint32_t _i = 0; _i < WorkPI; ++_i) { - uint32_t is_nan = IsNan::isnan(values[_i]); - uint32_t less = (!is_nan && Less{}(values[_i], value)); - auto le_pos = - sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); - auto ge_pos = sbg.get_local_linear_id() - le_pos; - - auto total_le = - sycl::reduce_over_group(sbg, less, sycl::plus<>()); - auto total_nan = - sycl::reduce_over_group(sbg, is_nan, sycl::plus<>()); - auto total_gr = sbg_size - total_le - total_nan; - - if (_i < actual_count) { - if (less) { - out[sbg_less_offset + le_item_offset + le_pos] = - values[_i]; - } - else if (!is_nan) { - out[sbg_gr_offset + gr_item_offset + ge_pos] = - values[_i]; - } - le_item_offset += total_le; - gr_item_offset += total_gr; - } - } - }; -} +#include "partitioning_one_pivot_kernel_cpu.hpp" +#include "partitioning_one_pivot_kernel_gpu.hpp" -template -sycl::event run_partition_one_pivot_cpu(sycl::queue &exec_q, - T *in, - T *out, - PartitionState &state, - const std::vector &deps, - uint32_t group_size) +namespace statistics::partitioning { - auto e = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(deps); +template +sycl::event run_partition_one_pivot(sycl::queue &exec_q, + T *in, + T *out, + PartitionState &state, + const std::vector &deps) +{ + auto device = exec_q.get_device(); - auto work_range = make_ndrange(state.n, group_size, WorkPI); + if (device.is_gpu()) { + constexpr uint32_t WorkPI = 8; + constexpr uint32_t group_size = 128; - cgh.parallel_for>( - work_range, partition_one_pivot_func_cpu(cgh, in, out, state)); - }); + return run_partition_one_pivot_gpu(exec_q, in, out, state, deps, group_size, WorkPI); + } + else { + constexpr uint32_t WorkPI = 4; + constexpr uint32_t group_size = 128; - return e; + return run_partition_one_pivot_cpu(exec_q, in, out, state, + deps, group_size); + } +} } - -} // namespace statistics::partitioning diff --git 
a/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_cpu.hpp b/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_cpu.hpp new file mode 100644 index 000000000000..35200dd3e4a8 --- /dev/null +++ b/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_cpu.hpp @@ -0,0 +1,228 @@ +//***************************************************************************** +// Copyright (c) 2024-2025, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include "utils/math_utils.hpp" +#include +#include + +#include + +#include "ext/common.hpp" + +#include "partitioning.hpp" + +using dpctl::tensor::usm_ndarray; + +using ext::common::AtomicOp; +using ext::common::IsNan; +using ext::common::Less; +using ext::common::make_ndrange; + +namespace statistics::partitioning +{ + +template +struct partition_one_pivot_kernel_cpu; + +template +auto partition_one_pivot_func_cpu(sycl::handler &cgh, T *in, T *out, PartitionState &state) +{ + auto loc_counters = + sycl::local_accessor(sycl::range<1>(4), cgh); + + return [=](sycl::nd_item<1> item) { + if (state.stop[0]) + return; + + auto group = item.get_group(); + uint64_t items_per_group = group.get_local_range(0) * WorkPI; + uint64_t num_elems = state.num_elems[0]; + + if (group.get_group_id(0) * items_per_group >= num_elems) + return; + + T *_in = nullptr; + if (state.left[0]) { + _in = in; + } + else { + _in = in + state.n - num_elems; + } + + auto value = state.pivot[0]; + + auto sbg = item.get_sub_group(); + uint32_t sbg_size = sbg.get_max_local_range()[0]; + + uint64_t i_base = + (item.get_global_linear_id() - sbg.get_local_linear_id()) * + WorkPI; + + if (group.leader()) { + loc_counters[0] = 0; + loc_counters[1] = 0; + loc_counters[2] = 0; + } + + sycl::group_barrier(group); + + uint32_t less_count = 0; + uint32_t equal_count = 0; + uint32_t greater_equal_count = 0; + uint32_t nan_count = 0; + + T values[WorkPI]; + uint32_t actual_count = 0; + uint64_t local_i_base = i_base + sbg.get_local_linear_id(); + + for (uint32_t _i = 0; _i < WorkPI; ++_i) { + auto i = local_i_base + _i * sbg_size; + if (i < num_elems) { + values[_i] = _in[i]; + auto is_nan = IsNan::isnan(values[_i]); + less_count += (Less{}(values[_i], value) && !is_nan); + equal_count += (values[_i] == value && !is_nan); + nan_count += is_nan; + actual_count++; + } + } + + greater_equal_count = actual_count - less_count - nan_count; + + auto sbg_less_equal = + sycl::reduce_over_group(sbg, less_count, sycl::plus<>()); + auto sbg_equal = + sycl::reduce_over_group(sbg, equal_count, sycl::plus<>()); + auto sbg_greater = sycl::reduce_over_group(sbg, greater_equal_count, + sycl::plus<>()); + + uint32_t local_less_offset = 0; + uint32_t local_gr_offset = 0; + if (sbg.leader()) { + sycl::atomic_ref + gr_less_eq(loc_counters[0]); + local_less_offset = gr_less_eq.fetch_add(sbg_less_equal); + + sycl::atomic_ref + gr_eq(loc_counters[1]); + gr_eq += sbg_equal; + + sycl::atomic_ref + gr_greater(loc_counters[2]); + local_gr_offset = gr_greater.fetch_add(sbg_greater); + } + + local_less_offset = + sycl::select_from_group(sbg, local_less_offset, 0); + local_gr_offset = sycl::select_from_group(sbg, local_gr_offset, 0); + + sycl::group_barrier(group); + + if (group.leader()) { + sycl::atomic_ref + glbl_less_eq(state.iteration_counters.less_count[0]); + auto global_less_eq_offset = + glbl_less_eq.fetch_add(loc_counters[0]); + + sycl::atomic_ref + glbl_eq(state.iteration_counters.equal_count[0]); + glbl_eq += loc_counters[1]; + + sycl::atomic_ref + glbl_greater( + state.iteration_counters.greater_equal_count[0]); + auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]); + + loc_counters[0] = global_less_eq_offset; + loc_counters[2] = global_gr_offset; + } + + sycl::group_barrier(group); + + auto sbg_less_offset = loc_counters[0] + local_less_offset; + auto sbg_gr_offset = + state.n - (loc_counters[2] + local_gr_offset + sbg_greater); + + uint32_t 
le_item_offset = 0; + uint32_t gr_item_offset = 0; + + for (uint32_t _i = 0; _i < WorkPI; ++_i) { + uint32_t is_nan = IsNan::isnan(values[_i]); + uint32_t less = (!is_nan && Less{}(values[_i], value)); + auto le_pos = + sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); + auto ge_pos = sbg.get_local_linear_id() - le_pos; + + auto total_le = + sycl::reduce_over_group(sbg, less, sycl::plus<>()); + auto total_nan = + sycl::reduce_over_group(sbg, is_nan, sycl::plus<>()); + auto total_gr = sbg_size - total_le - total_nan; + + if (_i < actual_count) { + if (less) { + out[sbg_less_offset + le_item_offset + le_pos] = + values[_i]; + } + else if (!is_nan) { + out[sbg_gr_offset + gr_item_offset + ge_pos] = + values[_i]; + } + le_item_offset += total_le; + gr_item_offset += total_gr; + } + } + }; +} + +template +sycl::event run_partition_one_pivot_cpu(sycl::queue &exec_q, + T *in, + T *out, + PartitionState &state, + const std::vector &deps, + uint32_t group_size) +{ + auto e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + + auto work_range = make_ndrange(state.n, group_size, WorkPI); + + cgh.parallel_for>( + work_range, partition_one_pivot_func_cpu(cgh, in, out, state)); + }); + + return e; +} + +} // namespace statistics::partitioning diff --git a/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_gpu.hpp b/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_gpu.hpp new file mode 100644 index 000000000000..810c37170b67 --- /dev/null +++ b/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_gpu.hpp @@ -0,0 +1,214 @@ +//***************************************************************************** +// Copyright (c) 2024-2025, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include "utils/math_utils.hpp" +#include +#include + +#include + +#include "ext/common.hpp" + +#include "partitioning.hpp" + +using dpctl::tensor::usm_ndarray; + +using ext::common::AtomicOp; +using ext::common::IsNan; +using ext::common::Less; +using ext::common::make_ndrange; + +namespace statistics::partitioning +{ + +template +struct partition_one_pivot_kernel_gpu; + +template +auto partition_one_pivot_func_gpu(sycl::handler &cgh, T *in, T *out, PartitionState &state, uint32_t group_size, uint32_t WorkPI) +{ + auto loc_counters = sycl::local_accessor(sycl::range<1>(4), cgh); + auto loc_global_counters = sycl::local_accessor(sycl::range<1>(2), cgh); + auto loc_items = sycl::local_accessor(sycl::range<1>(WorkPI*group_size), cgh); + + return [=](sycl::nd_item<1> item) { + if (state.stop[0]) + return; + + auto group = item.get_group(); + auto group_range = group.get_local_range(0); + auto llid = item.get_local_linear_id(); + uint64_t items_per_group = group.get_local_range(0)*WorkPI; + uint64_t num_elems = state.num_elems[0]; + + if (group.get_group_id(0)*items_per_group >= num_elems) + return; + + T* _in = nullptr; + if (state.left[0]) + { + _in = in; + } + else + { + _in = in + state.n - num_elems; + } + + auto value = state.pivot[0]; + + auto sbg = item.get_sub_group(); + + uint32_t sbg_size = sbg.get_max_local_range()[0]; + uint32_t sbg_work_size = sbg_size*WorkPI; + uint32_t sbg_llid = sbg.get_local_linear_id(); + uint64_t i_base = (item.get_global_linear_id() - sbg_llid)*WorkPI; + + if (group.leader()) + { + loc_counters[0] = 0; + loc_counters[1] = 0; + loc_counters[2] = 0; + } + + sycl::group_barrier(group); + + for (uint32_t _i = 0; _i < WorkPI; ++_i) + { + uint32_t less_count = 0; + uint32_t equal_count = 0; + uint32_t greater_equal_count = 0; + + uint32_t actual_count = 0; + auto i = i_base + _i*sbg_size + sbg_llid; + uint32_t valid = i < num_elems; + auto val = valid ? 
_in[i] : 0; + uint32_t less = (val < value) && valid; + uint32_t equal = (val == value) && valid; + + auto le_pos = sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); + auto ge_pos = sbg.get_local_linear_id() - le_pos; + auto sbg_less_equal = sycl::reduce_over_group(sbg, less, sycl::plus<>()); + auto sbg_equal = sycl::reduce_over_group(sbg, equal, sycl::plus<>()); + auto tot_valid = sycl::reduce_over_group(sbg, valid, sycl::plus<>()); + auto sbg_greater = tot_valid - sbg_less_equal; + + uint32_t local_less_offset = 0; + uint32_t local_gr_offset = 0; + + if (sbg.leader()) + { + sycl::atomic_ref gr_less_eq(loc_counters[0]); + local_less_offset = gr_less_eq.fetch_add(sbg_less_equal); + + sycl::atomic_ref gr_eq(loc_counters[1]); + gr_eq += sbg_equal; + + sycl::atomic_ref gr_greater(loc_counters[2]); + local_gr_offset = gr_greater.fetch_add(sbg_greater); + } + + uint32_t local_less_offset_ = sycl::select_from_group(sbg, local_less_offset, 0); + uint32_t local_gr_offset_ = sycl::select_from_group(sbg, local_gr_offset, 0); + + if (valid) + { + if (less) + { + uint32_t ll_offset = local_less_offset_ + le_pos; + loc_items[ll_offset] = val; + } + else + { + auto loc_gr_offset = group_range*WorkPI - local_gr_offset_ - sbg_greater + ge_pos; + loc_items[loc_gr_offset] = val; + } + } + } + + sycl::group_barrier(group); + + if (group.leader()) + { + sycl::atomic_ref glbl_less_eq(state.iteration_counters.less_count[0]); + auto global_less_eq_offset = glbl_less_eq.fetch_add(loc_counters[0]); + + sycl::atomic_ref glbl_eq(state.iteration_counters.equal_count[0]); + glbl_eq += loc_counters[1]; + + sycl::atomic_ref glbl_greater(state.iteration_counters.greater_equal_count[0]); + auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]); + + loc_global_counters[0] = global_less_eq_offset; + loc_global_counters[1] = global_gr_offset + loc_counters[2]; + } + + sycl::group_barrier(group); + + auto global_less_eq_offset = loc_global_counters[0]; + auto global_gr_offset = state.n - loc_global_counters[1]; + + uint32_t sbg_id = sbg.get_group_id(); + for (uint32_t _i = 0; _i < WorkPI; ++_i) + { + uint32_t i = sbg_id*sbg_size*WorkPI + _i*sbg_size + sbg_llid; + if (i < loc_counters[0]) + { + out[global_less_eq_offset + i] = loc_items[i]; + } + else if (i < loc_counters[0] + loc_counters[2]) + { + auto global_gr_offset_ = global_gr_offset + i - loc_counters[0]; + uint32_t local_buff_offset = WorkPI*group_range - loc_counters[2] + i - loc_counters[0]; + + out[global_gr_offset_] = loc_items[local_buff_offset]; + } + } + }; +} + +template +sycl::event run_partition_one_pivot_gpu(sycl::queue &exec_q, + T *in, + T *out, + PartitionState &state, + const std::vector &deps, + uint32_t group_size, + uint32_t WorkPI) +{ + auto e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + + auto work_range = make_ndrange(state.n, group_size, WorkPI); + + cgh.parallel_for>( + work_range, partition_one_pivot_func_gpu(cgh, in, out, state, group_size, WorkPI)); + }); + + return e; +} + +} // namespace statistics::partitioning From f1095f1de87aac78a06c90e869033600c4d78d18 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Fri, 16 May 2025 14:47:40 +0200 Subject: [PATCH 08/13] adding nextafter for corner case --- .../extensions/statistics/kth_element1d.cpp | 70 +++++++++++++------ 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index 1f8a81490591..4ad86439f012 100644 --- 
a/dpnp/backend/extensions/statistics/kth_element1d.cpp
+++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp
@@ -55,6 +55,31 @@ using namespace ext::common;
 
 namespace
 {
+template <typename T>
+T NextAfter(T x)
+{
+    if constexpr (std::is_floating_point<T>::value) {
+        return sycl::nextafter(x, std::numeric_limits<T>::infinity());
+    }
+    else if constexpr (std::is_integral<T>::value) {
+        if (x < std::numeric_limits<T>::max())
+            return x + 1;
+        else
+            return x;
+    }
+    else if constexpr (type_utils::is_complex_v<T>) {
+        if (x.imag() != std::numeric_limits<typename T::value_type>::infinity()) {
+            return T{x.real(), NextAfter(x.imag())};
+        }
+        else if (x.real() != std::numeric_limits<typename T::value_type>::infinity()) {
+            return T{NextAfter(x.real()), -x.imag()};
+        }
+        else {
+            return x;
+        }
+    }
+}
+
 template <typename T>
 struct pick_pivot_kernel;
 
@@ -85,12 +110,6 @@ struct KthElementF
             auto scratch = sycl::local_accessor<std::uint8_t>(
                 sycl::range<1>(temp_memory_size), cgh);
 
-            // std::cout << "temp_memory_size: " << temp_memory_size
-            //           << " items_to_sort: " << items_to_sort
-            //           << " limit: " << limit
-            //           << " group_size: " << group_size << "\n";
-
-            // auto str = sycl::stream(8192, 1024, cgh);
             cgh.parallel_for<pick_pivot_kernel<T>>(
                 work_sz, [=](sycl::nd_item<1> item) {
                     auto group = item.get_group();
@@ -192,12 +211,12 @@ struct KthElementF
                                          &loc_items[0] + items_to_sort,
                                          Less<T>{});
 
-                    T new_pivot = loc_items[items_to_sort / 2];
+                    state.num_elems[0] = num_elems;
 
-                    if (new_pivot != state.pivot[0]) {
+                    T new_pivot = loc_items[items_to_sort / 2];
+                    if (new_pivot != state.pivot[0] && !IsNan<T>::isnan(new_pivot)) {
                         if (group.leader()) {
                             state.pivot[0] = new_pivot;
-                            state.num_elems[0] = num_elems;
                         }
                         return;
                     }
@@ -206,7 +225,7 @@ struct KthElementF
                     uint32_t index = start;
                     for (uint32_t i = start; i < items_to_sort; i += local_size)
                     {
-                        if (loc_items[i] != new_pivot) {
+                        if (loc_items[i] != new_pivot && !IsNan<T>::isnan(loc_items[i])) {
                             index = i;
                             break;
                         }
@@ -215,8 +234,22 @@ struct KthElementF
                     index = sycl::reduce_over_group(group, index,
                                                     sycl::minimum<>());
                     if (group.leader()) {
-                        state.pivot[0] = loc_items[index];
-                        state.num_elems[0] = num_elems;
+                        if (loc_items[index] != new_pivot || !IsNan<T>::isnan(loc_items[index]))
+                        {
+                            // if all values are Nan just use it as pivot
+                            // to filter out all the Nans
+                            state.pivot[0] = loc_items[index];
+                        }
+                        else {
+                            // we are going to filter out new_pivot
+                            // but we need to keep at least one since it
+                            // could be our target (but not target + 1)
+                            out[state.n - 1] = new_pivot;
+                            state.iteration_counters.greater_equal_count[0] += 1;
+                            state.counters.less_count[0] -= 1;
+                            new_pivot = NextAfter(new_pivot);
+                            state.pivot[0] = new_pivot;
+                        }
                     }
                 });
         });
@@ -280,7 +313,6 @@ struct KthElementF
                     const size_t k,
                     const std::vector<sycl::event> &depends)
     {
-        auto start = std::chrono::high_resolution_clock::now();
         const T *ain = static_cast<const T *>(v_ain);
         T *partitioned = static_cast<T *>(v_partitioned);
 
@@ -312,11 +344,11 @@ struct KthElementF
         copy_evt = exec_queue.copy(state.counters.nan_count, &nan_count, 1,
                                    copy_evt);
 
+        copy_evt.wait();
+
         uint64_t buff_offset = 0;
         uint64_t elems_offset = less_count;
 
-        copy_evt.wait();
-
         if (!found) {
             if (left) {
                 elems_offset = less_count - num_elems;
@@ -332,14 +364,6 @@ struct KthElementF
 
         state.cleanup(exec_queue);
 
-        auto end = std::chrono::high_resolution_clock::now();
-
-        auto duration =
-            std::chrono::duration_cast<std::chrono::microseconds>(end - start)
-                .count();
-
-        std::cout << "KthElement1d took " << duration << " microseconds"
-                  << std::endl;
         return {found, buff_offset, elems_offset, num_elems, nan_count};
     }
 };

From 1191433802d9de693619bf75ae21d10280b9fbed Mon Sep 17 00:00:00 2001
From: Alexander
Kalistratov Date: Fri, 16 May 2025 14:50:01 +0200 Subject: [PATCH 09/13] pre-commit --- .../extensions/statistics/kth_element1d.cpp | 243 +++++++++--------- .../extensions/statistics/partitioning.hpp | 7 +- .../partitioning_one_pivot_kernel_cpu.hpp | 42 ++- .../partitioning_one_pivot_kernel_gpu.hpp | 122 +++++---- 4 files changed, 215 insertions(+), 199 deletions(-) diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index 4ad86439f012..bbc6f9c345cd 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -110,148 +110,145 @@ struct KthElementF auto scratch = sycl::local_accessor( sycl::range<1>(temp_memory_size), cgh); - cgh.parallel_for>( - work_sz, [=](sycl::nd_item<1> item) { - auto group = item.get_group(); - - if (state.stop[0]) - return; - - auto llid = item.get_local_linear_id(); - auto local_size = item.get_group_range(0); - - uint64_t num_elems = 0; - bool target_found = false; - - T *_in = nullptr; - if (group.leader()) { - state.update_counters(); - auto less_count = state.counters.less_count[0]; - bool left = target < less_count; - state.left[0] = left; - - if (left) { - _in = in; - num_elems = state.iteration_counters.less_count[0]; - if (target + 1 == less_count) { - _in[num_elems] = state.pivot[0]; - state.counters.less_count[0] += 1; - num_elems += 1; - } - } - else { - num_elems = - state.iteration_counters.greater_equal_count[0]; - _in = in + state.n - num_elems; - - if (target + 1 < - less_count + - state.iteration_counters.equal_count[0]) { - state.values[0] = state.pivot[0]; - state.values[1] = state.pivot[0]; - - state.stop[0] = true; - state.target_found[0] = true; - target_found = true; - } + cgh.parallel_for>(work_sz, [=](sycl::nd_item<1> + item) { + auto group = item.get_group(); + + if (state.stop[0]) + return; + + auto llid = item.get_local_linear_id(); + auto local_size = item.get_group_range(0); + + uint64_t num_elems = 0; + bool target_found = false; + + T *_in = nullptr; + if (group.leader()) { + state.update_counters(); + auto less_count = state.counters.less_count[0]; + bool left = target < less_count; + state.left[0] = left; + + if (left) { + _in = in; + num_elems = state.iteration_counters.less_count[0]; + if (target + 1 == less_count) { + _in[num_elems] = state.pivot[0]; + state.counters.less_count[0] += 1; + num_elems += 1; } - state.reset_iteration_counters(); } + else { + num_elems = + state.iteration_counters.greater_equal_count[0]; + _in = in + state.n - num_elems; - target_found = - sycl::group_broadcast(group, target_found, 0); - _in = sycl::group_broadcast(group, _in, 0); - num_elems = sycl::group_broadcast(group, num_elems, 0); - - if (target_found) { - return; - } - - if (num_elems <= limit) { - auto gh = sycl_exp::group_with_scratchpad( - group, sycl::span{&scratch[0], temp_memory_size}); - if (num_elems > 0) - sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems], - Less{}); - - if (group.leader()) { - uint64_t offset = state.counters.less_count[0]; - if (state.left[0]) { - offset = - state.counters.less_count[0] - num_elems; - } - - int64_t idx = target - offset; - - state.values[0] = _in[idx]; - state.values[1] = _in[idx + 1]; + if (target + 1 < + less_count + + state.iteration_counters.equal_count[0]) { + state.values[0] = state.pivot[0]; + state.values[1] = state.pivot[0]; state.stop[0] = true; state.target_found[0] = true; + target_found = true; } - - return; } + state.reset_iteration_counters(); + } - 
uint64_t step = num_elems / items_to_sort; - for (uint32_t i = llid; i < items_to_sort; i += local_size) - { - loc_items[i] = std::numeric_limits::max(); - uint32_t idx = i * step; - if (idx < num_elems) { - loc_items[i] = _in[idx]; - } - } + target_found = sycl::group_broadcast(group, target_found, 0); + _in = sycl::group_broadcast(group, _in, 0); + num_elems = sycl::group_broadcast(group, num_elems, 0); - sycl::group_barrier(group); + if (target_found) { + return; + } + if (num_elems <= limit) { auto gh = sycl_exp::group_with_scratchpad( group, sycl::span{&scratch[0], temp_memory_size}); - sycl_exp::joint_sort(gh, &loc_items[0], - &loc_items[0] + items_to_sort, - Less{}); + if (num_elems > 0) + sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems], + Less{}); - state.num_elems[0] = num_elems; - - T new_pivot = loc_items[items_to_sort / 2]; - if (new_pivot != state.pivot[0] && !IsNan::isnan(new_pivot)) { - if (group.leader()) { - state.pivot[0] = new_pivot; + if (group.leader()) { + uint64_t offset = state.counters.less_count[0]; + if (state.left[0]) { + offset = state.counters.less_count[0] - num_elems; } - return; + + int64_t idx = target - offset; + + state.values[0] = _in[idx]; + state.values[1] = _in[idx + 1]; + + state.stop[0] = true; + state.target_found[0] = true; } - auto start = llid + items_to_sort / 2 + 1; - uint32_t index = start; - for (uint32_t i = start; i < items_to_sort; i += local_size) - { - if (loc_items[i] != new_pivot && !IsNan::isnan(loc_items[i])) { - index = i; - break; - } + return; + } + + uint64_t step = num_elems / items_to_sort; + for (uint32_t i = llid; i < items_to_sort; i += local_size) { + loc_items[i] = std::numeric_limits::max(); + uint32_t idx = i * step; + if (idx < num_elems) { + loc_items[i] = _in[idx]; } + } + + sycl::group_barrier(group); - index = sycl::reduce_over_group(group, index, - sycl::minimum<>()); + auto gh = sycl_exp::group_with_scratchpad( + group, sycl::span{&scratch[0], temp_memory_size}); + sycl_exp::joint_sort(gh, &loc_items[0], + &loc_items[0] + items_to_sort, Less{}); + + state.num_elems[0] = num_elems; + + T new_pivot = loc_items[items_to_sort / 2]; + if (new_pivot != state.pivot[0] && !IsNan::isnan(new_pivot)) + { if (group.leader()) { - if (loc_items[index] != new_pivot || !IsNan::isnan(loc_items[index])) - { - // if all values are Nan just use it as pivot - // to filter out all the Nans - state.pivot[0] = loc_items[index]; - } - else { - // we are going to filter out new_pivot - // but we need to keep at least one since it - // could be our target (but not target + 1) - out[state.n - 1] = new_pivot; - state.iteration_counters.greater_equal_count[0] += 1; - state.counters.less_count[0] -= 1; - new_pivot = NextAfter(new_pivot); - state.pivot[0] = new_pivot; - } + state.pivot[0] = new_pivot; + } + return; + } + + auto start = llid + items_to_sort / 2 + 1; + uint32_t index = start; + for (uint32_t i = start; i < items_to_sort; i += local_size) { + if (loc_items[i] != new_pivot && + !IsNan::isnan(loc_items[i])) { + index = i; + break; + } + } + + index = + sycl::reduce_over_group(group, index, sycl::minimum<>()); + if (group.leader()) { + if (loc_items[index] != new_pivot || + !IsNan::isnan(loc_items[index])) { + // if all values are Nan just use it as pivot + // to filter out all the Nans + state.pivot[0] = loc_items[index]; + } + else { + // we are going to filter out new_pivot + // but we need to keep at least one since it + // could be our target (but not target + 1) + out[state.n - 1] = new_pivot; + 
state.iteration_counters.greater_equal_count[0] += 1; + state.counters.less_count[0] -= 1; + new_pivot = NextAfter(new_pivot); + state.pivot[0] = new_pivot; } - }); + } + }); }); return e; @@ -266,8 +263,8 @@ struct KthElementF uint32_t group_size = 128; constexpr uint32_t WorkPI = 4; - return run_partition_one_pivot_cpu( - exec_q, in, out, state, deps, group_size); + return run_partition_one_pivot_cpu(exec_q, in, out, state, + deps, group_size); } static sycl::event run_kth_element(sycl::queue &exec_q, diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp index 3d01990a2503..19a4a2705895 100644 --- a/dpnp/backend/extensions/statistics/partitioning.hpp +++ b/dpnp/backend/extensions/statistics/partitioning.hpp @@ -213,14 +213,15 @@ sycl::event run_partition_one_pivot(sycl::queue &exec_q, constexpr uint32_t WorkPI = 8; constexpr uint32_t group_size = 128; - return run_partition_one_pivot_gpu(exec_q, in, out, state, deps, group_size, WorkPI); + return run_partition_one_pivot_gpu(exec_q, in, out, state, deps, + group_size, WorkPI); } else { constexpr uint32_t WorkPI = 4; constexpr uint32_t group_size = 128; return run_partition_one_pivot_cpu(exec_q, in, out, state, - deps, group_size); + deps, group_size); } } -} +} // namespace statistics::partitioning diff --git a/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_cpu.hpp b/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_cpu.hpp index 35200dd3e4a8..f9ed9039c340 100644 --- a/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_cpu.hpp +++ b/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_cpu.hpp @@ -49,7 +49,10 @@ template struct partition_one_pivot_kernel_cpu; template -auto partition_one_pivot_func_cpu(sycl::handler &cgh, T *in, T *out, PartitionState &state) +auto partition_one_pivot_func_cpu(sycl::handler &cgh, + T *in, + T *out, + PartitionState &state) { auto loc_counters = sycl::local_accessor(sycl::range<1>(4), cgh); @@ -79,8 +82,7 @@ auto partition_one_pivot_func_cpu(sycl::handler &cgh, T *in, T *out, PartitionSt uint32_t sbg_size = sbg.get_max_local_range()[0]; uint64_t i_base = - (item.get_global_linear_id() - sbg.get_local_linear_id()) * - WorkPI; + (item.get_global_linear_id() - sbg.get_local_linear_id()) * WorkPI; if (group.leader()) { loc_counters[0] = 0; @@ -117,50 +119,48 @@ auto partition_one_pivot_func_cpu(sycl::handler &cgh, T *in, T *out, PartitionSt sycl::reduce_over_group(sbg, less_count, sycl::plus<>()); auto sbg_equal = sycl::reduce_over_group(sbg, equal_count, sycl::plus<>()); - auto sbg_greater = sycl::reduce_over_group(sbg, greater_equal_count, - sycl::plus<>()); + auto sbg_greater = + sycl::reduce_over_group(sbg, greater_equal_count, sycl::plus<>()); uint32_t local_less_offset = 0; uint32_t local_gr_offset = 0; if (sbg.leader()) { sycl::atomic_ref + sycl::memory_scope::work_group> gr_less_eq(loc_counters[0]); local_less_offset = gr_less_eq.fetch_add(sbg_less_equal); sycl::atomic_ref + sycl::memory_scope::work_group> gr_eq(loc_counters[1]); gr_eq += sbg_equal; sycl::atomic_ref + sycl::memory_scope::work_group> gr_greater(loc_counters[2]); local_gr_offset = gr_greater.fetch_add(sbg_greater); } - local_less_offset = - sycl::select_from_group(sbg, local_less_offset, 0); + local_less_offset = sycl::select_from_group(sbg, local_less_offset, 0); local_gr_offset = sycl::select_from_group(sbg, local_gr_offset, 0); sycl::group_barrier(group); if (group.leader()) { sycl::atomic_ref + 
sycl::memory_scope::device> glbl_less_eq(state.iteration_counters.less_count[0]); auto global_less_eq_offset = glbl_less_eq.fetch_add(loc_counters[0]); sycl::atomic_ref + sycl::memory_scope::device> glbl_eq(state.iteration_counters.equal_count[0]); glbl_eq += loc_counters[1]; sycl::atomic_ref - glbl_greater( - state.iteration_counters.greater_equal_count[0]); + sycl::memory_scope::device> + glbl_greater(state.iteration_counters.greater_equal_count[0]); auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]); loc_counters[0] = global_less_eq_offset; @@ -183,20 +183,17 @@ auto partition_one_pivot_func_cpu(sycl::handler &cgh, T *in, T *out, PartitionSt sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); auto ge_pos = sbg.get_local_linear_id() - le_pos; - auto total_le = - sycl::reduce_over_group(sbg, less, sycl::plus<>()); + auto total_le = sycl::reduce_over_group(sbg, less, sycl::plus<>()); auto total_nan = sycl::reduce_over_group(sbg, is_nan, sycl::plus<>()); auto total_gr = sbg_size - total_le - total_nan; if (_i < actual_count) { if (less) { - out[sbg_less_offset + le_item_offset + le_pos] = - values[_i]; + out[sbg_less_offset + le_item_offset + le_pos] = values[_i]; } else if (!is_nan) { - out[sbg_gr_offset + gr_item_offset + ge_pos] = - values[_i]; + out[sbg_gr_offset + gr_item_offset + ge_pos] = values[_i]; } le_item_offset += total_le; gr_item_offset += total_gr; @@ -219,7 +216,8 @@ sycl::event run_partition_one_pivot_cpu(sycl::queue &exec_q, auto work_range = make_ndrange(state.n, group_size, WorkPI); cgh.parallel_for>( - work_range, partition_one_pivot_func_cpu(cgh, in, out, state)); + work_range, + partition_one_pivot_func_cpu(cgh, in, out, state)); }); return e; diff --git a/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_gpu.hpp b/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_gpu.hpp index 810c37170b67..cbe0ed46e4d0 100644 --- a/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_gpu.hpp +++ b/dpnp/backend/extensions/statistics/partitioning_one_pivot_kernel_gpu.hpp @@ -49,11 +49,19 @@ template struct partition_one_pivot_kernel_gpu; template -auto partition_one_pivot_func_gpu(sycl::handler &cgh, T *in, T *out, PartitionState &state, uint32_t group_size, uint32_t WorkPI) +auto partition_one_pivot_func_gpu(sycl::handler &cgh, + T *in, + T *out, + PartitionState &state, + uint32_t group_size, + uint32_t WorkPI) { - auto loc_counters = sycl::local_accessor(sycl::range<1>(4), cgh); - auto loc_global_counters = sycl::local_accessor(sycl::range<1>(2), cgh); - auto loc_items = sycl::local_accessor(sycl::range<1>(WorkPI*group_size), cgh); + auto loc_counters = + sycl::local_accessor(sycl::range<1>(4), cgh); + auto loc_global_counters = + sycl::local_accessor(sycl::range<1>(2), cgh); + auto loc_items = + sycl::local_accessor(sycl::range<1>(WorkPI * group_size), cgh); return [=](sycl::nd_item<1> item) { if (state.stop[0]) @@ -62,19 +70,17 @@ auto partition_one_pivot_func_gpu(sycl::handler &cgh, T *in, T *out, PartitionSt auto group = item.get_group(); auto group_range = group.get_local_range(0); auto llid = item.get_local_linear_id(); - uint64_t items_per_group = group.get_local_range(0)*WorkPI; + uint64_t items_per_group = group.get_local_range(0) * WorkPI; uint64_t num_elems = state.num_elems[0]; - if (group.get_group_id(0)*items_per_group >= num_elems) + if (group.get_group_id(0) * items_per_group >= num_elems) return; - T* _in = nullptr; - if (state.left[0]) - { + T *_in = nullptr; + if (state.left[0]) { _in = in; } - else 
- { + else { _in = in + state.n - num_elems; } @@ -83,12 +89,11 @@ auto partition_one_pivot_func_gpu(sycl::handler &cgh, T *in, T *out, PartitionSt auto sbg = item.get_sub_group(); uint32_t sbg_size = sbg.get_max_local_range()[0]; - uint32_t sbg_work_size = sbg_size*WorkPI; + uint32_t sbg_work_size = sbg_size * WorkPI; uint32_t sbg_llid = sbg.get_local_linear_id(); - uint64_t i_base = (item.get_global_linear_id() - sbg_llid)*WorkPI; + uint64_t i_base = (item.get_global_linear_id() - sbg_llid) * WorkPI; - if (group.leader()) - { + if (group.leader()) { loc_counters[0] = 0; loc_counters[1] = 0; loc_counters[2] = 0; @@ -96,54 +101,63 @@ auto partition_one_pivot_func_gpu(sycl::handler &cgh, T *in, T *out, PartitionSt sycl::group_barrier(group); - for (uint32_t _i = 0; _i < WorkPI; ++_i) - { + for (uint32_t _i = 0; _i < WorkPI; ++_i) { uint32_t less_count = 0; uint32_t equal_count = 0; uint32_t greater_equal_count = 0; uint32_t actual_count = 0; - auto i = i_base + _i*sbg_size + sbg_llid; + auto i = i_base + _i * sbg_size + sbg_llid; uint32_t valid = i < num_elems; auto val = valid ? _in[i] : 0; uint32_t less = (val < value) && valid; uint32_t equal = (val == value) && valid; - auto le_pos = sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); + auto le_pos = + sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); auto ge_pos = sbg.get_local_linear_id() - le_pos; - auto sbg_less_equal = sycl::reduce_over_group(sbg, less, sycl::plus<>()); - auto sbg_equal = sycl::reduce_over_group(sbg, equal, sycl::plus<>()); - auto tot_valid = sycl::reduce_over_group(sbg, valid, sycl::plus<>()); + auto sbg_less_equal = + sycl::reduce_over_group(sbg, less, sycl::plus<>()); + auto sbg_equal = + sycl::reduce_over_group(sbg, equal, sycl::plus<>()); + auto tot_valid = + sycl::reduce_over_group(sbg, valid, sycl::plus<>()); auto sbg_greater = tot_valid - sbg_less_equal; uint32_t local_less_offset = 0; uint32_t local_gr_offset = 0; - if (sbg.leader()) - { - sycl::atomic_ref gr_less_eq(loc_counters[0]); + if (sbg.leader()) { + sycl::atomic_ref + gr_less_eq(loc_counters[0]); local_less_offset = gr_less_eq.fetch_add(sbg_less_equal); - sycl::atomic_ref gr_eq(loc_counters[1]); + sycl::atomic_ref + gr_eq(loc_counters[1]); gr_eq += sbg_equal; - sycl::atomic_ref gr_greater(loc_counters[2]); + sycl::atomic_ref + gr_greater(loc_counters[2]); local_gr_offset = gr_greater.fetch_add(sbg_greater); } - uint32_t local_less_offset_ = sycl::select_from_group(sbg, local_less_offset, 0); - uint32_t local_gr_offset_ = sycl::select_from_group(sbg, local_gr_offset, 0); + uint32_t local_less_offset_ = + sycl::select_from_group(sbg, local_less_offset, 0); + uint32_t local_gr_offset_ = + sycl::select_from_group(sbg, local_gr_offset, 0); - if (valid) - { - if (less) - { + if (valid) { + if (less) { uint32_t ll_offset = local_less_offset_ + le_pos; loc_items[ll_offset] = val; } - else - { - auto loc_gr_offset = group_range*WorkPI - local_gr_offset_ - sbg_greater + ge_pos; + else { + auto loc_gr_offset = group_range * WorkPI - + local_gr_offset_ - sbg_greater + + ge_pos; loc_items[loc_gr_offset] = val; } } @@ -151,15 +165,21 @@ auto partition_one_pivot_func_gpu(sycl::handler &cgh, T *in, T *out, PartitionSt sycl::group_barrier(group); - if (group.leader()) - { - sycl::atomic_ref glbl_less_eq(state.iteration_counters.less_count[0]); - auto global_less_eq_offset = glbl_less_eq.fetch_add(loc_counters[0]); + if (group.leader()) { + sycl::atomic_ref + glbl_less_eq(state.iteration_counters.less_count[0]); + auto global_less_eq_offset = + 
glbl_less_eq.fetch_add(loc_counters[0]); - sycl::atomic_ref glbl_eq(state.iteration_counters.equal_count[0]); + sycl::atomic_ref + glbl_eq(state.iteration_counters.equal_count[0]); glbl_eq += loc_counters[1]; - sycl::atomic_ref glbl_greater(state.iteration_counters.greater_equal_count[0]); + sycl::atomic_ref + glbl_greater(state.iteration_counters.greater_equal_count[0]); auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]); loc_global_counters[0] = global_less_eq_offset; @@ -172,17 +192,16 @@ auto partition_one_pivot_func_gpu(sycl::handler &cgh, T *in, T *out, PartitionSt auto global_gr_offset = state.n - loc_global_counters[1]; uint32_t sbg_id = sbg.get_group_id(); - for (uint32_t _i = 0; _i < WorkPI; ++_i) - { - uint32_t i = sbg_id*sbg_size*WorkPI + _i*sbg_size + sbg_llid; - if (i < loc_counters[0]) - { + for (uint32_t _i = 0; _i < WorkPI; ++_i) { + uint32_t i = sbg_id * sbg_size * WorkPI + _i * sbg_size + sbg_llid; + if (i < loc_counters[0]) { out[global_less_eq_offset + i] = loc_items[i]; } - else if (i < loc_counters[0] + loc_counters[2]) - { + else if (i < loc_counters[0] + loc_counters[2]) { auto global_gr_offset_ = global_gr_offset + i - loc_counters[0]; - uint32_t local_buff_offset = WorkPI*group_range - loc_counters[2] + i - loc_counters[0]; + uint32_t local_buff_offset = WorkPI * group_range - + loc_counters[2] + i - + loc_counters[0]; out[global_gr_offset_] = loc_items[local_buff_offset]; } @@ -205,7 +224,8 @@ sycl::event run_partition_one_pivot_gpu(sycl::queue &exec_q, auto work_range = make_ndrange(state.n, group_size, WorkPI); cgh.parallel_for>( - work_range, partition_one_pivot_func_gpu(cgh, in, out, state, group_size, WorkPI)); + work_range, partition_one_pivot_func_gpu(cgh, in, out, state, + group_size, WorkPI)); }); return e; From e2da2e0abd011d79e8dc64b18395c686a3f6f7b6 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Fri, 16 May 2025 17:44:47 +0200 Subject: [PATCH 10/13] fixing case when input array is too small --- .../extensions/statistics/kth_element1d.cpp | 16 +++++++++++----- dpnp/dpnp_utils/dpnp_utils_statistics.py | 1 - 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index bbc6f9c345cd..c317c613298e 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -278,11 +278,17 @@ struct KthElementF { uint32_t items_to_sort = 127; uint32_t limit = 4 * (items_to_sort + 1); - uint32_t iterations = - std::ceil(-std::log(double(state.n) / limit) / std::log(0.536)) + 1; - // Ensure iterations are odd so the final result is always stored in - // 'partitioned' - iterations += 1 - iterations % 2; + + uint32_t iterations = 1; + + if (state.n > limit) { + iterations = std::ceil( + -std::log(double(state.n) / limit) / std::log(0.536)) + 1; + + // Ensure iterations are odd so the final result is always stored in + // 'partitioned' + iterations += 1 - iterations % 2; + } auto prev = run_pick_pivot(exec_q, const_cast(in), partitioned, k, state, items_to_sort, limit, depends); diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py index 8f4571a6f998..e4314e50f84f 100644 --- a/dpnp/dpnp_utils/dpnp_utils_statistics.py +++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py @@ -194,7 +194,6 @@ def dpnp_cov( def native_median(a): - partitioned = dpnp.empty_like(a) a_usm = dpnp.get_usm_ndarray(a) partitioned_usm = 
dpnp.get_usm_ndarray(partitioned) From 07c3a4afa9298eeca8e3b1c8997d946b20d47699 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Wed, 28 May 2025 17:04:42 +0200 Subject: [PATCH 11/13] Cover corner case for small inputs --- dpnp/backend/extensions/common/ext/common.hpp | 29 +++--- .../extensions/statistics/kth_element1d.cpp | 90 ++++++++++++++++++- dpnp/dpnp_utils/dpnp_utils_statistics.py | 55 ++++++++++-- dpnp/tests/test_statistics.py | 23 +---- 4 files changed, 154 insertions(+), 43 deletions(-) diff --git a/dpnp/backend/extensions/common/ext/common.hpp b/dpnp/backend/extensions/common/ext/common.hpp index 080df62b25e3..d56ac80d5d78 100644 --- a/dpnp/backend/extensions/common/ext/common.hpp +++ b/dpnp/backend/extensions/common/ext/common.hpp @@ -70,20 +70,6 @@ struct AtomicOp } }; -template -struct Less -{ - bool operator()(const T &lhs, const T &rhs) const - { - if constexpr (type_utils::is_complex_v) { - return dpctl::tensor::math_utils::less_complex(lhs, rhs); - } - else { - return std::less{}(lhs, rhs); - } - } -}; - template struct IsNan { @@ -106,6 +92,21 @@ struct IsNan } }; +template +struct Less +{ + bool operator()(const T &lhs, const T &rhs) const + { + if constexpr (type_utils::is_complex_v) { + return IsNan::isnan(rhs) || + dpctl::tensor::math_utils::less_complex(lhs, rhs); + } + else { + return IsNan::isnan(rhs) || std::less{}(lhs, rhs); + } + } +}; + template struct value_type_of_impl; diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index c317c613298e..7c53d7509752 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -83,9 +83,89 @@ T NextAfter(T x) template struct pick_pivot_kernel; +template +struct kth_sorter_kernel; + template struct KthElementF { + static std::tuple + run_kth_sort(sycl::queue &exec_q, + const T *in, + const size_t k, + State &state, + const std::vector &depends) + { + auto device = exec_q.get_device(); + size_t local_mem_size = get_local_mem_size_in_bytes(device); + size_t temp_memory_size = + sycl_exp::default_sorters::joint_sorter<>::memory_required( + sycl::memory_scope::work_group, state.n); + size_t loc_items_mem = sizeof(T) * state.n; + + if ((temp_memory_size + loc_items_mem) > local_mem_size) + return {false, sycl::event{}}; + + auto e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const uint32_t local_size = get_max_local_size(exec_q); + const uint32_t WorkPI = CeilDiv(state.n, local_size); + auto work_sz = make_ndrange(state.n, local_size, WorkPI); + auto loc_items = + sycl::local_accessor(sycl::range<1>(state.n), cgh); + auto scratch = sycl::local_accessor( + sycl::range<1>(temp_memory_size), cgh); + + cgh.parallel_for>( + work_sz, [=](sycl::nd_item<1> item) { + auto group = item.get_group(); + auto sbg = item.get_sub_group(); + + if (state.stop[0]) + return; + + auto llid = item.get_local_linear_id(); + uint32_t sbg_size = sbg.get_max_local_range()[0]; + uint32_t sbg_llid = sbg.get_local_linear_id(); + auto local_size = item.get_group_range(0); + uint32_t nan_count = 0; + + uint32_t i_base = + sbg.get_group_id() * WorkPI * sbg_size + sbg_llid; + for (uint32_t i = 0; i < WorkPI; i++) { + uint32_t idx = i_base + i * sbg_size; + if (idx < state.n) { + loc_items[idx] = in[idx]; + if (IsNan::isnan(in[idx])) { + nan_count++; + } + } + } + + nan_count = sycl::reduce_over_group(group, nan_count, + sycl::plus<>()); + sycl::group_barrier(group); + + auto gh = 
sycl_exp::group_with_scratchpad( + group, sycl::span{&scratch[0], temp_memory_size}); + sycl_exp::joint_sort(gh, &loc_items[0], + &loc_items[0] + state.n, Less{}); + + sycl::group_barrier(group); + + if (group.leader()) { + state.values[0] = loc_items[k]; + state.values[1] = loc_items[k + 1]; + state.target_found[0] = true; + state.counters.nan_count[0] = nan_count; + } + }); + }); + + return {true, e}; + } + static sycl::event run_pick_pivot(sycl::queue &queue, T *in, T *out, @@ -276,14 +356,20 @@ struct KthElementF PartitionState &pstate, const std::vector &depends) { + auto [success, evt] = run_kth_sort(exec_q, in, k, state, depends); + if (success) { + return evt; + } + uint32_t items_to_sort = 127; uint32_t limit = 4 * (items_to_sort + 1); uint32_t iterations = 1; if (state.n > limit) { - iterations = std::ceil( - -std::log(double(state.n) / limit) / std::log(0.536)) + 1; + iterations = std::ceil(-std::log(double(state.n) / limit) / + std::log(0.536)) + + 1; // Ensure iterations are odd so the final result is always stored in // 'partitioned' diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py index e4314e50f84f..98cafefd5421 100644 --- a/dpnp/dpnp_utils/dpnp_utils_statistics.py +++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py @@ -34,6 +34,7 @@ import dpnp import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext from dpnp.dpnp_array import dpnp_array +from dpnp.dpnp_utils.dpnp_utils_common import to_supported_dtypes __all__ = ["dpnp_cov", "dpnp_median"] @@ -193,15 +194,46 @@ def dpnp_cov( return c.squeeze() -def native_median(a): - partitioned = dpnp.empty_like(a) - a_usm = dpnp.get_usm_ndarray(a) +def native_median(a, ignore_nan): + a = dpnp.reshape(a, a.size) + device = a.sycl_device + + result_dtype = dpnp.default_float_type() + if dpnp.issubdtype(a.dtype, dpnp.complexfloating): + result_dtype = a.dtype + + if a.size == 0: + return dpnp.array(dpnp.nan, ndmin=1, dtype=result_dtype) + elif a.size == 1: + return dpnp.array(a[0], ndmin=1, dtype=result_dtype) + + supported_types = statistics_ext.kth_element_dtypes() + supported_dtype = to_supported_dtypes(a.dtype, supported_types, device) + + if supported_dtype is None: # pragma: no cover + raise ValueError( + f"function does not support input type " + f"{a.dtype.name}, " + "and the input could not be coerced to any " + f"supported type. 
List of supported types: "
+            f"{[st.name for st in supported_types]}"
+        )
+
+    a_casted = dpnp.asarray(a, dtype=supported_dtype, order="C")
+
+    partitioned = dpnp.empty_like(a_casted)
+
+    a_usm = dpnp.get_usm_ndarray(a_casted)
     partitioned_usm = dpnp.get_usm_ndarray(partitioned)
 
     _manager = dpu.SequentialOrderManager[a.sycl_queue]
 
-    result = dpnp.empty_like(a, shape=1)
-    k = a.shape[0] // 2
+    result = dpnp.empty_like(a, dtype=result_dtype, shape=1)
+
+    nans = 0
+    if ignore_nan:
+        nans = dpnp.isnan(a_usm).sum()
+    k = (a.shape[0] - 1 - nans) // 2
 
     found, buff_offset, elems_offset, num_elems, nan_count = (
         statistics_ext.kth_element(
@@ -212,6 +244,9 @@ def native_median(a):
         )
     )
 
+    if not ignore_nan and nan_count > 0:
+        return dpnp.array(dpnp.nan, ndmin=1, dtype=result_dtype)
+
     if found:
         if a.shape[0] % 2 == 0:
             # even number of elements
@@ -240,6 +275,13 @@ def dpnp_median(
 ):
     """Compute the median of an array along a specified axis."""
 
+    if axis is None or a.ndim == 1:
+        result = native_median(a, ignore_nan)
+        if not keepdims:
+            return result[0]
+
+        return result.reshape((1,) * a.ndim)
+
     a_ndim = a.ndim
     a_shape = a.shape
     _axis = range(a_ndim) if axis is None else axis
@@ -262,9 +304,6 @@ def dpnp_median(
         )
         axis = -1
 
-    if not ignore_nan and a_ndim == 1:
-        return native_median(a)
-
     if overwrite_input:
         if isinstance(a, dpt.usm_ndarray):
             # dpnp.ndarray.sort only works with dpnp_array
diff --git a/dpnp/tests/test_statistics.py b/dpnp/tests/test_statistics.py
index cf436087b607..f41fd6e850b8 100644
--- a/dpnp/tests/test_statistics.py
+++ b/dpnp/tests/test_statistics.py
@@ -915,6 +915,7 @@ def test_basic(self, dtype, size):
         a = generate_random_numpy_array(size, dtype)
         ia = dpnp.array(a)
 
+        # import pdb; pdb.set_trace()
         expected = numpy.median(a)
         result = dpnp.median(ia)
         assert_dtype_allclose(result, expected)
@@ -979,25 +980,6 @@ def test_nan(self, axis, keepdims):
 
         assert_dtype_allclose(result, expected)
 
-    @pytest.mark.parametrize("axis", [None, 0, -1, (0, -2, -1)])
-    @pytest.mark.parametrize("keepdims", [True, False])
-    def test_overwrite_input(self, axis, keepdims):
-        a = generate_random_numpy_array((2, 3, 4))
-        ia = dpnp.array(a)
-
-        b = a.copy()
-        ib = ia.copy()
-        expected = numpy.median(
-            b, axis=axis, keepdims=keepdims, overwrite_input=True
-        )
-        result = dpnp.median(
-            ib, axis=axis, keepdims=keepdims, overwrite_input=True
-        )
-        assert not numpy.all(a == b)
-        assert not dpnp.all(ia == ib)
-
-        assert_dtype_allclose(result, expected)
-
     @pytest.mark.parametrize("axis", [None, 0, (-1,), [0, 1]])
     @pytest.mark.parametrize("overwrite_input", [True, False])
     def test_usm_ndarray(self, axis, overwrite_input):
@@ -1008,6 +990,9 @@ def test_usm_ndarray(self, axis, overwrite_input):
         result = dpnp.median(ia, axis=axis, overwrite_input=overwrite_input)
         assert_dtype_allclose(result, expected)
 
+        if not overwrite_input:
+            assert_dtype_allclose(ia, a)
+
 
 class TestPtp:
     @pytest.mark.parametrize("axis", [None, 0, 1])

From b809fc0f15491fbeb55dd9c9149349cce7b8bebb Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov
Date: Fri, 30 May 2025 15:04:11 +0200
Subject: [PATCH 12/13] Add validation in c++

---
 .../extensions/statistics/CMakeLists.txt    |  1 +
 .../extensions/statistics/kth_element1d.cpp |  2 +-
 .../extensions/statistics/partitioning.cpp  | 69 +++++++++++++++++++
 .../extensions/statistics/partitioning.hpp  |  4 ++
 4 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 dpnp/backend/extensions/statistics/partitioning.cpp

diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
From b809fc0f15491fbeb55dd9c9149349cce7b8bebb Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov
Date: Fri, 30 May 2025 15:04:11 +0200
Subject: [PATCH 12/13] Add validation in c++

---
 .../extensions/statistics/CMakeLists.txt      |  1 +
 .../extensions/statistics/kth_element1d.cpp   |  2 +-
 .../extensions/statistics/partitioning.cpp    | 69 +++++++++++++++++++
 .../extensions/statistics/partitioning.hpp    |  4 ++
 4 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 dpnp/backend/extensions/statistics/partitioning.cpp

diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 4d95669d4464..9fa56d52dcd0 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -31,6 +31,7 @@ set(_module_src
     ${CMAKE_CURRENT_SOURCE_DIR}/histogramdd.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/histogram_common.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/kth_element1d.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/partitioning.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/sliding_dot_product1d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sliding_window1d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/statistics_py.cpp
diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp
index 7c53d7509752..c94fed4f8b7d 100644
--- a/dpnp/backend/extensions/statistics/kth_element1d.cpp
+++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp
@@ -477,7 +477,7 @@ KthElement1d::RetT KthElement1d::call(const dpctl::tensor::usm_ndarray &a,
                                       const size_t k,
                                       const std::vector<sycl::event> &depends)
 {
-    // validate(a, partitioned, k);
+    validate(a, partitioned, k);
 
     const int a_typenum = a.get_typenum();
     auto kth_elem_func = dispatch_table.get(a_typenum);
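
Enabling validate() above activates the precondition checks added below. The
upper bound on k follows from the kernel contract: kth_element reports both
the k-th and (k + 1)-th order statistics, so k + 1 must still be a valid
index, hence k <= a.size() - 2. A rough Python sketch of the same contract
(names are illustrative; the real checks live in ext/validation_utils.hpp and
also cover dtype and USM compatibility):

    def check_kth_element_args(a_shape, partitioned_shape, k):
        if len(a_shape) != 1 or len(partitioned_shape) != 1:
            raise ValueError("'a' and 'partitioned' must be one-dimensional")
        if a_shape != partitioned_shape:
            raise ValueError("'a' and 'partitioned' must have the same size")
        # the kernel returns the k-th and (k+1)-th smallest elements,
        # so k + 1 must still index into the array
        if k > a_shape[0] - 2:
            raise ValueError(f"'k' must be from 0 to {a_shape[0] - 2}, got {k}")

    check_kth_element_args((10,), (10,), 8)  # ok: k + 1 == 9 is a valid index
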
diff --git a/dpnp/backend/extensions/statistics/partitioning.cpp b/dpnp/backend/extensions/statistics/partitioning.cpp
new file mode 100644
index 000000000000..64a5d3c3ec85
--- /dev/null
+++ b/dpnp/backend/extensions/statistics/partitioning.cpp
@@ -0,0 +1,69 @@
+//*****************************************************************************
+// Copyright (c) 2024-2025, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <string>
+#include <vector>
+
+#include "dpctl4pybind11.hpp"
+#include "utils/type_dispatch.hpp"
+#include <pybind11/pybind11.h>
+
+#include "sliding_window1d.hpp"
+#include "ext/common.hpp"
+#include "ext/validation_utils.hpp"
+
+namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
+using namespace ext::common;
+using namespace ext::validation;
+
+using dpctl::tensor::usm_ndarray;
+using dpctl_td_ns::typenum_t;
+
+namespace statistics::partitioning
+{
+
+void validate(const usm_ndarray &a,
+              const usm_ndarray &partitioned,
+              const size_t k)
+{
+    array_names names = {
+        {&a, "a"},
+        {&partitioned, "partitioned"}
+    };
+
+    common_checks({&a}, {&partitioned}, names);
+    check_same_size(&a, &partitioned, names);
+    check_num_dims(&a, 1, names);
+    check_num_dims(&partitioned, 1, names);
+    check_same_dtype(&a, &partitioned, names);
+
+    if (k > a.get_size() - 2) {
+        throw py::value_error("'k' must be from 0 to a.size() - 2, "
+                              "but got k = " + std::to_string(k) +
+                              " and a.size() = " + std::to_string(a.get_size()));
+    }
+}
+
+} // namespace statistics::partitioning
diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp
index 19a4a2705895..9c58ae7308c3 100644
--- a/dpnp/backend/extensions/statistics/partitioning.hpp
+++ b/dpnp/backend/extensions/statistics/partitioning.hpp
@@ -224,4 +224,8 @@ sycl::event run_partition_one_pivot(sycl::queue &exec_q,
                                     deps, group_size);
     }
 }
+
+void validate(const usm_ndarray &a,
+              const usm_ndarray &partitioned,
+              const size_t k);
 } // namespace statistics::partitioning

From 6dc582ef84425c32d99585d8aad3a5cbb4b45d60 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov
Date: Fri, 30 May 2025 15:36:31 +0200
Subject: [PATCH 13/13] pre-commit

---
 dpnp/backend/extensions/statistics/partitioning.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/dpnp/backend/extensions/statistics/partitioning.cpp b/dpnp/backend/extensions/statistics/partitioning.cpp
index 64a5d3c3ec85..abd8bd69cefe 100644
--- a/dpnp/backend/extensions/statistics/partitioning.cpp
+++ b/dpnp/backend/extensions/statistics/partitioning.cpp
@@ -30,9 +30,9 @@
 #include "utils/type_dispatch.hpp"
 #include <pybind11/pybind11.h>
 
-#include "sliding_window1d.hpp"
 #include "ext/common.hpp"
 #include "ext/validation_utils.hpp"
+#include "sliding_window1d.hpp"
 
 namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
 using namespace ext::common;
@@ -48,10 +48,7 @@ void validate(const usm_ndarray &a,
               const usm_ndarray &partitioned,
               const size_t k)
 {
-    array_names names = {
-        {&a, "a"},
-        {&partitioned, "partitioned"}
-    };
+    array_names names = {{&a, "a"}, {&partitioned, "partitioned"}};
 
     common_checks({&a}, {&partitioned}, names);
     check_same_size(&a, &partitioned, names);
@@ -61,8 +58,9 @@
     if (k > a.get_size() - 2) {
         throw py::value_error("'k' must be from 0 to a.size() - 2, "
-                              "but got k = " + std::to_string(k) +
-                              " and a.size() = " + std::to_string(a.get_size()));
+                              "but got k = " +
+                              std::to_string(k) + " and a.size() = " +
+                              std::to_string(a.get_size()));
     }
 }
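
For completeness, a hedged end-to-end usage sketch of what the series enables;
whether a given call actually takes the native kth-element path depends on
dtype and device support, as handled by native_median above:

    import dpnp

    ia = dpnp.array([5.0, 1.0, 4.0, 2.0, 3.0])
    print(dpnp.median(ia))  # 3.0: odd length, single middle element
    print(dpnp.median(dpnp.array([4.0, 1.0, 3.0, 2.0])))  # 2.5: mean of 2 and 3
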