diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index 7aa4ea74e7ac07..7475a5585842ae 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -71,7 +71,7 @@ unset(_ov_download_tbb_done CACHE)
 # or ENABLE_SYSTEM_TBB is OFF
 #
 function(ov_download_tbb)
-    if(_ov_download_tbb_done OR NOT THREADING MATCHES "^(TBB|TBB_AUTO)$")
+    if(_ov_download_tbb_done OR NOT THREADING MATCHES "^(TBB|TBB_AUTO|TBB_ADAPTIVE)$")
         return()
     endif()
     set(_ov_download_tbb_done ON CACHE INTERNAL "Whether prebuilt TBB is already downloaded")
diff --git a/cmake/features.cmake b/cmake/features.cmake
index ebeaca5d3df7de..89313647cb703f 100644
--- a/cmake/features.cmake
+++ b/cmake/features.cmake
@@ -78,10 +78,15 @@ ov_dependent_option (ENABLE_PKGCONFIG_GEN "Enable openvino.pc pkg-config file ge
 # OpenVINO Runtime specific options
 #
 
-# "OneDNN library based on OMP or TBB or Sequential implementation: TBB|OMP|SEQ"
-set(THREADING_DEFAULT "TBB")
+# "OneDNN library based on OMP or TBB or Sequential implementation: TBB|OMP|SEQ|TBB_ADAPTIVE"
+if(AARCH64)
+    set(THREADING_DEFAULT "TBB")
+else()
+    set(THREADING_DEFAULT "TBB_ADAPTIVE")
+endif()
+
-set(THREADING_OPTIONS "TBB" "TBB_AUTO" "SEQ" "OMP")
+set(THREADING_OPTIONS "TBB" "TBB_AUTO" "SEQ" "OMP" "TBB_ADAPTIVE")
 
 set(THREADING "${THREADING_DEFAULT}" CACHE STRING "Threading")
 set_property(CACHE THREADING PROPERTY STRINGS ${THREADING_OPTIONS})
@@ -99,7 +104,7 @@ endif()
 ov_dependent_option (ENABLE_INTEL_OPENMP "Enables usage of Intel OpenMP instead of default compiler one" ${ENABLE_INTEL_OPENMP_DEFAULT} "THREADING STREQUAL OMP" OFF)
 
-if((THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO") AND
+if((THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE") AND
     (BUILD_SHARED_LIBS OR (LINUX AND X86_64)))
     set(ENABLE_TBBBIND_2_5_DEFAULT ON)
 else()
diff --git a/cmake/templates/OpenVINOConfig.cmake.in b/cmake/templates/OpenVINOConfig.cmake.in
index 448ac9017e4ad9..7c0dd41aefd7fb 100644
--- a/cmake/templates/OpenVINOConfig.cmake.in
+++ b/cmake/templates/OpenVINOConfig.cmake.in
@@ -169,7 +169,7 @@ endmacro()
 macro(_ov_find_tbb)
     set(_ov_threading "@THREADING@")
-    if(_ov_threading STREQUAL "TBB" OR _ov_threading STREQUAL "TBB_AUTO")
+    if(_ov_threading STREQUAL "TBB" OR _ov_threading STREQUAL "TBB_AUTO" OR _ov_threading STREQUAL "TBB_ADAPTIVE")
         set(enable_pkgconfig_tbb "@tbb_FOUND@")
 
         # try tbb.pc
@@ -563,7 +563,7 @@ if(_ov_as_external_package)
     # WA for cmake version < 3.16 which does not export
     # IMPORTED_LINK_DEPENDENT_LIBRARIES_** properties if no PUBLIC dependencies for the library
-    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO")
+    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE")
         foreach(type RELEASE DEBUG RELWITHDEBINFO MINSIZEREL)
             foreach(tbb_target TBB::tbb TBB::tbbmalloc PkgConfig::tbb)
                 if(TARGET ${tbb_target})
diff --git a/src/bindings/python/src/openvino/properties/intel_cpu/__init__.py b/src/bindings/python/src/openvino/properties/intel_cpu/__init__.py
index cd3e8d953a4395..3cb0f76e552681 100644
--- a/src/bindings/python/src/openvino/properties/intel_cpu/__init__.py
+++ b/src/bindings/python/src/openvino/properties/intel_cpu/__init__.py
@@ -2,6 +2,9 @@
 # Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+# Enums
+from openvino._pyopenvino.properties.intel_cpu import TbbPartitioner
+
 # Properties
 import openvino._pyopenvino.properties.intel_cpu as __intel_cpu
 from openvino.properties._properties import __make_properties
diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
index 5d194946bde609..81960cdefcbaa4 100644
--- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
+++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
@@ -107,11 +107,16 @@ void regmodule_properties(py::module m) {
     py::module m_intel_cpu =
         m_properties.def_submodule("intel_cpu", "openvino.properties.intel_cpu submodule that simulates ov::intel_cpu");
 
+    py::enum_<ov::intel_cpu::TbbPartitioner>(m_intel_cpu, "TbbPartitioner", py::arithmetic())
+        .value("STATIC", ov::intel_cpu::TbbPartitioner::STATIC)
+        .value("AUTO", ov::intel_cpu::TbbPartitioner::AUTO);
+
     // Submodule intel_cpu property
     wrap_property_RW(m_intel_cpu, ov::intel_cpu::denormals_optimization, "denormals_optimization");
     wrap_property_RW(m_intel_cpu, ov::intel_cpu::sparse_weights_decompression_rate, "sparse_weights_decompression_rate");
+    wrap_property_RW(m_intel_cpu, ov::intel_cpu::tbb_partitioner, "tbb_partitioner");
 
     // Submodule intel_gpu
     py::module m_intel_gpu =
diff --git a/src/bindings/python/src/pyopenvino/utils/utils.cpp b/src/bindings/python/src/pyopenvino/utils/utils.cpp
index dde59a79a8041b..8a233317a9df94 100644
--- a/src/bindings/python/src/pyopenvino/utils/utils.cpp
+++ b/src/bindings/python/src/pyopenvino/utils/utils.cpp
@@ -18,6 +18,7 @@
 #include "openvino/core/meta_data.hpp"
 #include "openvino/frontend/decoder.hpp"
 #include "openvino/frontend/graph_iterator.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 #include "openvino/runtime/properties.hpp"
 
 using Version = ov::pass::Serialize::Version;
@@ -245,6 +246,8 @@ py::object from_ov_any(const ov::Any& any) {
         return py::cast(any.as());
     } else if (any.is()) {
         return py::cast(any.as());
+    } else if (any.is<ov::intel_cpu::TbbPartitioner>()) {
+        return py::cast(any.as<ov::intel_cpu::TbbPartitioner>());
     } else if (any.is()) {
         return py::cast(any.as());
     } else if (any.is()) {
@@ -544,6 +547,8 @@ ov::Any py_object_to_any(const py::object& py_obj) {
         return py::cast(py_obj);
     } else if (py::isinstance(py_obj)) {
         return py::cast(py_obj);
+    } else if (py::isinstance<ov::intel_cpu::TbbPartitioner>(py_obj)) {
+        return py::cast<ov::intel_cpu::TbbPartitioner>(py_obj);
     } else if (py::isinstance(py_obj)) {
         return py::cast(py_obj);
     } else if (py::isinstance(py_obj)) {
diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py
index 266c8690b78231..61f08acef3ccea 100644
--- a/src/bindings/python/tests/test_runtime/test_properties.py
+++ b/src/bindings/python/tests/test_runtime/test_properties.py
@@ -116,6 +116,13 @@ def test_properties_rw_base():
             (log.Level.TRACE, "Level.TRACE", 4),
         ),
     ),
+    (
+        intel_cpu.TbbPartitioner,
+        (
+            (intel_cpu.TbbPartitioner.STATIC, "TbbPartitioner.STATIC", 1),
+            (intel_cpu.TbbPartitioner.AUTO, "TbbPartitioner.AUTO", 2),
+        ),
+    ),
     (
         intel_auto.SchedulePolicy,
         (
@@ -367,6 +374,14 @@ def test_properties_ro(ov_property_ro, expected_value):
             (2.0, 2.0),
         ),
     ),
+    (
+        intel_cpu.tbb_partitioner,
+        "TBB_PARTITIONER",
+        (
+            (intel_cpu.TbbPartitioner.STATIC, intel_cpu.TbbPartitioner.STATIC),
+            (intel_cpu.TbbPartitioner.AUTO, intel_cpu.TbbPartitioner.AUTO),
+        ),
+    ),
     (
         intel_auto.device_bind_buffer,
         "DEVICE_BIND_BUFFER",
diff --git a/src/cmake/install_tbb.cmake b/src/cmake/install_tbb.cmake
index dc126165ba77c4..a9a909b6d02d3c 100644
--- a/src/cmake/install_tbb.cmake
+++ b/src/cmake/install_tbb.cmake
@@ -66,7 +66,7 @@ unset(_ov_dynamic_tbbbind_2_5_found)
 # install TBB
 
 # define variables for OpenVINOConfig.cmake
-if(THREADING MATCHES "^(TBB|TBB_AUTO)$")
+if(THREADING MATCHES "^(TBB|TBB_AUTO|TBB_ADAPTIVE)$")
     set(OV_TBB_DIR "${TBB_DIR}")
     list(APPEND PATH_VARS "OV_TBB_DIR")
 endif()
@@ -80,7 +80,7 @@ endif()
 # - downloaded TBB should be a part of all packages
 # - custom TBB provided by users, needs to be a part of wheel packages
 # - system TBB also needs to be a part of wheel packages
-if(THREADING MATCHES "^(TBB|TBB_AUTO)$" AND
+if(THREADING MATCHES "^(TBB|TBB_AUTO|TBB_ADAPTIVE)$" AND
     ( (DEFINED TBBROOT AND TBBROOT MATCHES ${TEMP}) OR
       (DEFINED TBBROOT OR DEFINED TBB_DIR OR DEFINED ENV{TBBROOT} OR DEFINED ENV{TBB_DIR}) OR
       ENABLE_SYSTEM_TBB ) )
diff --git a/src/cmake/ov_parallel.cmake b/src/cmake/ov_parallel.cmake
index cdb29b0aa37868..509d243a0da542 100644
--- a/src/cmake/ov_parallel.cmake
+++ b/src/cmake/ov_parallel.cmake
@@ -76,7 +76,7 @@ function(_ov_get_tbb_location tbb_target _tbb_lib_location_var)
 endfunction()
 
 macro(ov_find_package_tbb)
-    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" AND NOT TBB_FOUND)
+    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE" AND NOT TBB_FOUND)
         # conan generates TBBConfig.cmake files, which follows cmake's
         # SameMajorVersion scheme, while TBB itself follows AnyNewerVersion one
         # see https://cmake.org/cmake/help/latest/module/CMakePackageConfigHelpers.html#generating-a-package-version-file
@@ -340,7 +340,7 @@ macro(ov_find_package_openmp)
 endmacro()
 
 function(ov_set_threading_interface_for TARGET_NAME)
-    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" AND NOT TBB_FOUND)
+    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE" AND NOT TBB_FOUND)
         # find TBB
         ov_find_package_tbb()
 
@@ -383,9 +383,13 @@ function(ov_set_threading_interface_for TARGET_NAME)
         add_library(openvino::threading ALIAS openvino_threading)
     endif()
 
-    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO")
+    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE")
         if(TBB_FOUND)
-            set(_ov_thread_define "OV_THREAD_TBB")
+            if(THREADING STREQUAL "TBB_ADAPTIVE")
+                set(_ov_thread_define "OV_THREAD_TBB_ADAPTIVE")
+            else()
+                set(_ov_thread_define "OV_THREAD_TBB")
+            endif()
             set(_ov_threading_lib TBB::tbb)
         else()
             set(THREADING "SEQ" PARENT_SCOPE)
diff --git a/src/core/include/openvino/core/parallel.hpp b/src/core/include/openvino/core/parallel.hpp
index 38cb05ff9b6072..2fe54c2286704b 100644
--- a/src/core/include/openvino/core/parallel.hpp
+++ b/src/core/include/openvino/core/parallel.hpp
@@ -17,12 +17,14 @@
 #include
 #include
 
-#define OV_THREAD_TBB 0
-#define OV_THREAD_OMP 1
-#define OV_THREAD_SEQ 2
-#define OV_THREAD_TBB_AUTO 3
-
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#define OV_THREAD_TBB 0
+#define OV_THREAD_OMP 1
+#define OV_THREAD_SEQ 2
+#define OV_THREAD_TBB_AUTO 3
+#define OV_THREAD_TBB_ADAPTIVE 4
+
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
+#    define OV_THREAD_USE_TBB 1
 #    ifndef NOMINMAX
 #        define NOMINMAX
 #    endif
@@ -66,7 +68,7 @@ inline int parallel_get_env_threads() {
 inline void parallel_set_max_nested_levels(int levels) {
     return;
 }
-#    if OV_THREAD == OV_THREAD_TBB
+#    if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
 #        define PARTITIONING , tbb::static_partitioner()
 
 // The TBB version less than 2018u1 has no static_partitioner argument for
@@ -81,6 +83,7 @@ inline void parallel_set_max_nested_levels(int levels) {
 #        define PARTITIONING
 #    endif
#elif OV_THREAD == OV_THREAD_OMP
+#    define OV_THREAD_USE_TBB 0
 #    include <omp.h>
 #    if !defined(_OPENMP)
 #        error Undefined OpenMP version.
@@ -162,6 +165,7 @@ inline int parallel_get_nested_level() {
 }
 
 #elif OV_THREAD == OV_THREAD_SEQ
+#    define OV_THREAD_USE_TBB 0
 #    include
 inline int parallel_get_env_threads() {
     return 1;
@@ -231,7 +235,7 @@ namespace ov {
 
 template <typename F>
 void parallel_nt(int nthr, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     if (nthr == 0)
         nthr = parallel_get_max_threads();
     if (nthr == 1) {
@@ -279,7 +283,7 @@ void parallel_nt_static(int nthr, const F& func) {
     if (nthr == 0)
         nthr = parallel_get_max_threads();
 
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     tbb::parallel_for(
         0,
         nthr,
@@ -305,7 +309,7 @@ void parallel_nt_static(int nthr, const F& func) {
 
 template <typename I, typename F>
 void parallel_sort(I begin, I end, const F& comparator) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     tbb::parallel_sort(begin, end, comparator);
 #elif OV_THREAD == OV_THREAD_OMP
     // TODO: propose OpenMP version
@@ -317,7 +321,7 @@ void parallel_sort(I begin, I end, const F& comparator) {
 
 template <typename T0, typename R, typename F>
 R parallel_sum(const T0& D0, const R& input, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     return _TBB_REDUCE_FUNC(
         tbb::blocked_range<T0>(0, D0),
         input,
@@ -351,7 +355,7 @@ R parallel_sum(const T0& D0, const R& input, const F& func) {
 
 template <typename T0, typename T1, typename R, typename F>
 R parallel_sum2d(const T0& D0, const T1& D1, const R& input, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     return _TBB_REDUCE_FUNC(
         tbb::blocked_range2d<T0, T1>(0, D0, 0, D1),
         input,
@@ -391,7 +395,7 @@ R parallel_sum2d(const T0& D0, const T1& D1, const R& input, const F& func) {
 }
 template <typename T0, typename T1, typename T2, typename R, typename F>
 R parallel_sum3d(const T0& D0, const T1& D1, const T2& D2, const R& input, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     return _TBB_REDUCE_FUNC(
         tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2),
         input,
@@ -524,7 +528,7 @@ void parallel_for(const T0& D0, const F& func) {
     if (D0 == T0(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -590,7 +594,7 @@ void parallel_for2d(const T0& D0, const T1& D1, const F& func) {
     if (D0 == T0(0) || D1 == T1(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -636,7 +640,7 @@ void parallel_for2d(const T0& D0, const T1& D1, const F& func) {
 
 template <typename T0, typename T1, typename F>
 void parallel_for2d_dynamic(const T0& D0, const T1& D1, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     tbb::parallel_for(tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), [=](const tbb::blocked_range2d<T0, T1>& r) {
         for (T0 d0 = r.rows().begin(); d0 < r.rows().end(); d0++) {
             for (T1 d1 = r.cols().begin(); d1 < r.cols().end(); d1++) {
@@ -674,7 +678,7 @@ void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) {
     if (D0 == T0(0) || D1 == T1(0) || D2 == T2(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1 * D2);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -720,7 +724,7 @@ void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) {
 
 template <typename T0, typename T1, typename T2, typename F>
 void parallel_for3d_dynamic(const T0& D0, const T1& D1, const T2& D2, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     tbb::parallel_for(tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2),
                       [=](const tbb::blocked_range3d<T0, T1, T2>& r) {
                           for (T0 d0 = r.pages().begin(); d0 < r.pages().end(); d0++) {
@@ -762,7 +766,7 @@ void parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons
     if (D0 == T0(0) || D1 == T1(0) || D2 == T2(0) || D3 == T3(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -838,7 +842,7 @@ void parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons
     if (D0 == T0(0) || D1 == T1(0) || D2 == T2(0) || D3 == T3(0) || D4 == T4(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3 * D4);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -916,7 +920,7 @@ void parallel_for6d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons
     if (D0 == T0(0) || D1 == T1(0) || D2 == T2(0) || D3 == T3(0) || D4 == T4(0) || D5 == T5(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3 * D4 * D5);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
diff --git a/src/inference/dev_api/openvino/runtime/performance_heuristics.hpp b/src/inference/dev_api/openvino/runtime/performance_heuristics.hpp
index f57da5f2f5900a..9319c9dc43dff6 100644
--- a/src/inference/dev_api/openvino/runtime/performance_heuristics.hpp
+++ b/src/inference/dev_api/openvino/runtime/performance_heuristics.hpp
@@ -23,7 +23,15 @@ struct MemBandwidthPressure {
     float ratio_mem_limited_convs = 0;
     float ratio_mem_limited_deconvs = 0;
     float ratio_mem_limited_gemms = 0;
+    float ratio_mem_limited_adds = 0;
     float ratio_compute_deconvs = 0;
+    int total_gemms = 0;
+    int total_convs = 0;
+    int total_adds = 0;
+    int total_light_gemms = 0;
+    int total_light_convs = 0;
+    int total_heavy_convs = 0;
+    int total_nodes = 0;
 
     static constexpr float UNKNOWN = FLT_MAX;
     static constexpr float ALL = 1.0f;
diff --git a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
index 32f2a5b732b40a..67f53484dcbaec 100644
--- a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
+++ b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
@@ -11,7 +11,7 @@
 
 #include "openvino/core/parallel.hpp"
 
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 #    include <tbb/enumerable_thread_specific.h>
 #else
 #    include
@@ -25,7 +25,7 @@
 namespace ov {
 namespace threading {
 
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 
 /**
  * @brief A wrapper class to keep object to be thread local.
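The preprocessor changes above fold every "TBB or TBB_AUTO or TBB_ADAPTIVE" check into the single OV_THREAD_USE_TBB macro (1 for all TBB-backed modes, 0 for OMP and SEQ). A minimal sketch of how downstream code can rely on it; the queue alias is a hypothetical illustration, not part of this PR:

```cpp
#include <queue>

#include "openvino/core/parallel.hpp"

#if OV_THREAD_USE_TBB
// Covers OV_THREAD_TBB, OV_THREAD_TBB_AUTO and the new OV_THREAD_TBB_ADAPTIVE.
#    include <tbb/concurrent_queue.h>
template <typename T>
using FastQueue = tbb::concurrent_queue<T>;
#else
// OMP and SEQ builds define OV_THREAD_USE_TBB to 0, so fall back to std::queue
// (callers must provide their own locking in this branch).
template <typename T>
using FastQueue = std::queue<T>;
#endif
```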
diff --git a/src/inference/dev_api/openvino/runtime/threading/thread_safe_containers.hpp b/src/inference/dev_api/openvino/runtime/threading/thread_safe_containers.hpp
index eb299728898968..26bbfb5c75d5cc 100644
--- a/src/inference/dev_api/openvino/runtime/threading/thread_safe_containers.hpp
+++ b/src/inference/dev_api/openvino/runtime/threading/thread_safe_containers.hpp
@@ -13,7 +13,7 @@
 
 #include "openvino/core/parallel.hpp"
 
-#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 #    include
 #    include
 #endif
@@ -47,7 +47,7 @@ class ThreadSafeQueueWithSize {
     std::queue<T> _queue;
     std::mutex _mutex;
 };
-#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 template <typename T>
 using ThreadSafeQueue = tbb::concurrent_queue<T>;
 template <typename T>
diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index 9d63a0e078bdef..57c69b97ff003e 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -26,6 +26,60 @@ namespace ov {
  */
 namespace intel_cpu {
 
+/**
+ * @enum TbbPartitioner
+ * @brief This enum contains the definition of the TBB partitioner type.
+ */
+enum class TbbPartitioner {
+    NONE = 0,    //!< None value
+    STATIC = 1,  //!< Static partitioner
+    AUTO = 2     //!< Auto partitioner
+};
+
+/** @cond INTERNAL */
+inline std::ostream& operator<<(std::ostream& os, const TbbPartitioner& tbb_partitioner) {
+    switch (tbb_partitioner) {
+    case TbbPartitioner::STATIC:
+        return os << "STATIC";
+    case TbbPartitioner::AUTO:
+        return os << "AUTO";
+    case TbbPartitioner::NONE:
+        return os << "NONE";
+    default:
+        OPENVINO_THROW("Unsupported tbb partitioner!");
+    }
+}
+
+inline std::istream& operator>>(std::istream& is, TbbPartitioner& tbb_partitioner) {
+    std::string str;
+    is >> str;
+    if (str == "STATIC") {
+        tbb_partitioner = TbbPartitioner::STATIC;
+    } else if (str == "AUTO") {
+        tbb_partitioner = TbbPartitioner::AUTO;
+    } else if (str == "NONE") {
+        tbb_partitioner = TbbPartitioner::NONE;
+    } else {
+        OPENVINO_THROW("Unsupported tbb partitioner: ", str);
+    }
+    return is;
+}
+/** @endcond */
+
+/**
+ * @brief This property defines which TBB partitioner is used for parallel regions.
+ * @ingroup ov_runtime_cpp_prop_api
+ *
+ * Developers can use this property to select the type of TBB partitioner.
+ *
+ * The following code is an example of setting the auto partitioner; the default is STATIC.
+ *
+ * @code
+ * core.set_property("CPU", ov::intel_cpu::tbb_partitioner(ov::intel_cpu::TbbPartitioner::AUTO));
+ * @endcode
+ */
+static constexpr Property<TbbPartitioner> tbb_partitioner{"TBB_PARTITIONER"};
+
 /**
  * @brief This property define whether to perform denormals optimization.
  * @ingroup ov_runtime_cpu_prop_cpp_api
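For reference, a hedged end-to-end sketch of the new property from application code; it assumes the usual ov::Core workflow, and "model.xml" is a placeholder path:

```cpp
#include <iostream>

#include "openvino/openvino.hpp"
#include "openvino/runtime/intel_cpu/properties.hpp"

int main() {
    ov::Core core;
    // Request the auto partitioner before compilation; STATIC is the default.
    core.set_property("CPU", ov::intel_cpu::tbb_partitioner(ov::intel_cpu::TbbPartitioner::AUTO));

    auto compiled = core.compile_model("model.xml", "CPU");

    // The compiled model exposes the effective value as a read-only property.
    auto partitioner = compiled.get_property(ov::intel_cpu::tbb_partitioner);
    std::cout << partitioner << std::endl;  // prints "AUTO" via operator<< above
    return 0;
}
```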
diff --git a/src/inference/src/dev/performance_heuristics.cpp b/src/inference/src/dev/performance_heuristics.cpp
index a0bf9f96fa8e4d..c49f1a5fdacc77 100644
--- a/src/inference/src/dev/performance_heuristics.cpp
+++ b/src/inference/src/dev/performance_heuristics.cpp
@@ -4,6 +4,7 @@
 
 #include "openvino/runtime/performance_heuristics.hpp"
 
+#include
 
 namespace ov {
 
 MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
@@ -11,7 +12,13 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
                                  float { return (cache_size / (size_data_moved * datatype_size)); };
@@ -26,8 +33,11 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
     for (const auto& node : model->get_ordered_ops()) {
         const auto node_name = node->get_type_info().name;
+
+        total_nodes++;
+
         if (std::strcmp("MatMul", node_name) && std::strcmp("Convolution", node_name) &&
-            std::strcmp("ConvolutionBackpropData", node_name)) {
+            std::strcmp("Add", node_name) && std::strcmp("ConvolutionBackpropData", node_name)) {
             if (!std::strcmp("GRUSequence", node_name) || !std::strcmp("TensorIterator", node_name)) {
                 MemBandwidthPressure res;
                 res.max_mem_tolerance = MemBandwidthPressure::UNKNOWN;
@@ -65,6 +75,10 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
             const auto kernels = node->input(1);
             total_convs++;
+
+            if (kernels.get_partial_shape().is_static() && output.get_partial_shape().is_static()) {
+                const auto& shapeOutput = output.get_shape();
+                const auto& shapeInput1 = kernels.get_shape();
+                dataSizeOutput =
+                    std::accumulate(shapeOutput.begin(), shapeOutput.end(), size_t(1), std::multiplies<size_t>());
+                auto conv_indicator = dataSizeOutput * data_type_size;
+                for (size_t n = 1; n < shapeInput1.size(); n++) {
+                    conv_indicator = conv_indicator * shapeInput1[n];
+                }
+                if (conv_indicator < light_convs_threshold) {
+                    total_light_convs++;
+                }
+                if (conv_indicator > heavy_convs_threshold) {
+                    total_heavy_convs++;
+                }
+            }
+
             if (kernels.get_partial_shape().is_static()) {
                 const auto& shape = kernels.get_shape();
                 if (shape.size() >= 4 /* conventional 2D/3D conv */ && shape[2] >= 3 && shape[3] >= 3) {
@@ -83,6 +115,7 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
             }
+
             if (shape.size() > 4 /*5D*/ && isINT8) {
                 compute_convs++;
                 continue;
@@ -90,7 +123,9 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
                     std::accumulate(shapeInput.begin(), shapeInput.end(), size_t(1), std::multiplies<size_t>());
                 dataSizeOutput =
-                    std::accumulate(shapeOutput.begin(), shapeOutput.end(), size_t(1), std::multiplies<size_t>());
+                    dataSizeOutput == 0
+                        ? std::accumulate(shapeOutput.begin(), shapeOutput.end(), size_t(1), std::multiplies<size_t>())
+                        : dataSizeOutput;
                 const auto factor = memLimitedFactor(static_cast<float>(dataSizeInput + dataSizeOutput), data_type_size);
                 mem_limited_convs += factor < mem_threshold_assume_limited;
                 worst_case = std::min(factor, worst_case);
@@ -116,6 +151,21 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
+        } else if (!std::strcmp("Add", node_name)) {
+            const auto input = node->input(0);
+            const auto output = node->output(0);
+            // Check that input and output shapes are fully defined (not dynamic)
+            if (input.get_partial_shape().is_static() && output.get_partial_shape().is_static()) {
+                const auto& shapeInput = input.get_shape();
+                const auto& shapeOutput = output.get_shape();
+                total_adds++;
+                dataSizeInput =
+                    std::accumulate(shapeInput.begin(), shapeInput.end(), size_t(1), std::multiplies<size_t>());
+                dataSizeOutput =
+                    std::accumulate(shapeOutput.begin(), shapeOutput.end(), size_t(1), std::multiplies<size_t>());
+                const auto factor = memLimitedFactor(static_cast<float>(dataSizeInput + dataSizeOutput), data_type_size);
+                mem_limited_adds += factor < mem_threshold_assume_limited;
+            }
         }
     }
 
     MemBandwidthPressure res;
@@ -123,8 +173,16 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
     res.ratio_mem_limited_convs = total_convs ? static_cast<float>(mem_limited_convs) / total_convs : 0;
     res.ratio_mem_limited_deconvs = total_deconvs ? static_cast<float>(mem_limited_deconvs) / total_deconvs : 0;
     res.ratio_mem_limited_gemms = total_gemms ? static_cast<float>(mem_limited_gemms) / total_gemms : 0;
+    res.ratio_mem_limited_adds = total_adds ? static_cast<float>(mem_limited_adds) / total_adds : 0;
     res.ratio_compute_convs = total_convs ? static_cast<float>(compute_convs) / total_convs : 0;
     res.ratio_compute_deconvs = total_deconvs ? static_cast<float>(compute_deconvs) / total_deconvs : 0;
+    res.total_gemms = total_gemms;
+    res.total_convs = total_convs;
+    res.total_adds = total_adds;
+    res.total_heavy_convs = total_heavy_convs;
+    res.total_light_convs = total_light_convs;
+    res.total_light_gemms = total_light_gemms;
+    res.total_nodes = total_nodes;
 
     return res;
 }
diff --git a/src/inference/src/dev/threading/cpu_streams_executor.cpp b/src/inference/src/dev/threading/cpu_streams_executor.cpp
index 1fbab9ba5f29b0..10755de34fff69 100644
--- a/src/inference/src/dev/threading/cpu_streams_executor.cpp
+++ b/src/inference/src/dev/threading/cpu_streams_executor.cpp
@@ -26,7 +26,7 @@ namespace ov {
 namespace threading {
 struct CPUStreamsExecutor::Impl {
     struct Stream {
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         struct Observer : public custom::task_scheduler_observer {
             CpuSet _mask;
             int _ncpus = 0;
@@ -66,7 +66,7 @@ struct CPUStreamsExecutor::Impl {
                                ((_impl->_config.get_streams() + _impl->_usedNumaNodes.size() - 1) /
                                 _impl->_usedNumaNodes.size()))
                         : _impl->_usedNumaNodes.at(_streamId % _impl->_usedNumaNodes.size());
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
             if (_impl->_config.get_streams_info_table().size() > 0) {
                 init_stream();
             }
@@ -91,14 +91,14 @@ struct CPUStreamsExecutor::Impl {
                 std::lock_guard<std::mutex> lock{_impl->_streamIdMutex};
                 _impl->_streamIdQueue.push(_streamId);
             }
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
             if (nullptr != _observer) {
                 _observer->observe(false);
             }
 #endif
         }
 
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         void create_tbb_task_arena(const int stream_id,
                                    const StreamCreateType stream_type,
                                    const int concurrency,
@@ -219,7 +219,7 @@ struct CPUStreamsExecutor::Impl {
         bool _execute = false;
         std::vector<int> _rank;
         std::queue<Task> _taskQueue;
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         std::unique_ptr<custom::task_arena> _taskArena;
         std::unique_ptr<Observer> _observer;
         std::vector<int> _cpu_ids;
@@ -408,7 +408,7 @@ struct CPUStreamsExecutor::Impl {
     }
 
     void Execute(const Task& task, Stream& stream) {
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         auto& arena = stream._taskArena;
         if (nullptr != arena) {
             arena->execute(std::move(task));
diff --git a/src/inference/src/dev/threading/executor_manager.cpp b/src/inference/src/dev/threading/executor_manager.cpp
index ae6c9ef7fa40d1..c9c9ed6fe9c7ab 100644
--- a/src/inference/src/dev/threading/executor_manager.cpp
+++ b/src/inference/src/dev/threading/executor_manager.cpp
@@ -7,7 +7,7 @@
 #include "openvino/core/parallel.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "openvino/runtime/threading/cpu_streams_executor.hpp"
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 #    if (TBB_INTERFACE_VERSION < 12000)
 #        include
 #    else
 #        include
 #    endif
@@ -47,7 +47,7 @@ class ExecutorManagerImpl : public ExecutorManager {
     bool tbbTerminateFlag = false;
     mutable std::mutex global_mutex;
     bool tbbThreadsCreated = false;
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 #    if (TBB_INTERFACE_VERSION < 12000)
     std::shared_ptr tbbTaskScheduler = nullptr;
 #    else
@@ -67,7 +67,7 @@ void ExecutorManagerImpl::set_property(const ov::AnyMap& properties) {
     for (const auto& it : properties) {
         if (it.first == ov::force_tbb_terminate.name()) {
             tbbTerminateFlag = it.second.as<bool>();
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
             if (tbbTerminateFlag) {
                 if (!tbbTaskScheduler) {
 #    if (TBB_INTERFACE_VERSION < 12000)
@@ -97,7 +97,7 @@ ov::Any ExecutorManagerImpl::get_property(const std::string& name) const {
 void ExecutorManagerImpl::reset_tbb() {
     std::lock_guard<std::mutex> guard(global_mutex);
     if (tbbTerminateFlag) {
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         if (tbbTaskScheduler && tbbThreadsCreated) {
 #    if (TBB_INTERFACE_VERSION < 12000)
             tbbTaskScheduler->terminate();
diff --git a/src/inference/src/dev/threading/parallel_custom_arena.cpp b/src/inference/src/dev/threading/parallel_custom_arena.cpp
index 7db44acb6fa9d5..4a25e206c3a0ad 100644
--- a/src/inference/src/dev/threading/parallel_custom_arena.cpp
+++ b/src/inference/src/dev/threading/parallel_custom_arena.cpp
@@ -7,7 +7,7 @@
 
 #include "dev/threading/itt.hpp"
 
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 
 #    define TBB_NUMA_SUPPORT_PRESENT (TBB_INTERFACE_VERSION >= 11100)
 
 #    if defined(__APPLE__)
@@ -332,4 +332,5 @@ int default_concurrency(numa_node_id id) {
 }  // namespace info
 }  // namespace custom
 
-#endif /*OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO*/
+#endif /*OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == \
+          OV_THREAD_TBB_ADAPTIVE*/
diff --git a/src/inference/src/dev/threading/parallel_custom_arena.hpp b/src/inference/src/dev/threading/parallel_custom_arena.hpp
index 6ac3864c0fc220..e24d8194568216 100644
--- a/src/inference/src/dev/threading/parallel_custom_arena.hpp
+++ b/src/inference/src/dev/threading/parallel_custom_arena.hpp
@@ -13,7 +13,7 @@
 #include "openvino/core/parallel.hpp"
 #include "openvino/runtime/common.hpp"
 
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
 
 #    include
 #    include
@@ -173,5 +173,4 @@ int default_concurrency(numa_node_id id = task_arena::automatic);
 int default_concurrency(task_arena::constraints c);
 }  // namespace info
 }  // namespace custom
-#endif /*(OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)*/
-
+#endif /*(OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)*/
diff --git a/src/inference/src/os/cpu_map_info.hpp b/src/inference/src/os/cpu_map_info.hpp
index 097057bc054b28..be6f3c587118cb 100644
--- a/src/inference/src/os/cpu_map_info.hpp
+++ b/src/inference/src/os/cpu_map_info.hpp
@@ -82,6 +82,7 @@ void parse_node_info_linux(const std::vector node_info_table,
 * @param[out] _numa_nodes total number for nodes in system
 * @param[out] _sockets total number for sockets in system
 * @param[out] _cores total number for physical CPU cores in system
+ * @param[out] _blocked_cores total number for blocked processors in system
 * @param[out] _proc_type_table summary table of number of processors per type
 * @param[out] _cpu_mapping_table CPU mapping table for each processor
 * @return
@@ -92,6 +93,7 @@ void parse_cache_info_linux(const std::vector> system_i
                             int& _numa_nodes,
                             int& _sockets,
                             int& _cores,
+                            int& _blocked_cores,
                             std::vector<std::vector<int>>& _proc_type_table,
                             std::vector<std::vector<int>>& _cpu_mapping_table);
diff --git a/src/inference/src/os/lin/lin_system_conf.cpp b/src/inference/src/os/lin/lin_system_conf.cpp
index 165f666bcbf2aa..2b58599e129ed5 100644
--- a/src/inference/src/os/lin/lin_system_conf.cpp
+++ b/src/inference/src/os/lin/lin_system_conf.cpp
@@ -248,6 +248,7 @@ CPU::CPU() {
                               _numa_nodes,
                               _sockets,
                               _cores,
+                              _blocked_cores,
                               _proc_type_table,
                               _cpu_mapping_table);
     }
@@ -400,6 +401,7 @@ void parse_cache_info_linux(const std::vector> system_i
                             int& _numa_nodes,
                             int& _sockets,
                             int& _cores,
+                            int& _blocked_cores,
                             std::vector<std::vector<int>>& _proc_type_table,
                             std::vector<std::vector<int>>& _cpu_mapping_table) {
     int n_group = 0;
@@ -486,6 +488,16 @@ void parse_cache_info_linux(const std::vector> system_i
 
             for (int m = core_1; m <= core_2; m++) {
                 update_proc_info(m, core_type);
+
+                if ((core_2 - core_1 == 1) &&
+                    _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] == LP_EFFICIENT_CORE_PROC) {
+                    _cpu_mapping_table[m][CPU_MAP_GROUP_ID] = CPU_BLOCKED;
+                    _cpu_mapping_table[m][CPU_MAP_USED_FLAG] = CPU_BLOCKED;
+                    _blocked_cores++;
+                    _cores--;
+                    _proc_type_table[0][ALL_PROC]--;
+                    _proc_type_table[0][_cpu_mapping_table[m][CPU_MAP_CORE_TYPE]]--;
+                }
             }
         } else {
             core_1 = std::stoi(system_info_table[nproc][0]);
@@ -544,11 +556,6 @@ void parse_cache_info_linux(const std::vector> system_i
                         sub_str = system_info_table[n][info_index].substr(endpos + 1);
                         core_2 = std::stoi(sub_str);
 
-                        if ((info_index == 1) && (core_2 - core_1 == 1) &&
-                            (_proc_type_table[0][EFFICIENT_CORE_PROC] > 0)) {
-                            offline_list.push_back(n);
-                            break;
-                        }
                         for (int m = core_1; m <= core_2; m++) {
                             _cpu_mapping_table[m][CPU_MAP_SOCKET_ID] = _sockets;
                             _cpu_mapping_table[m][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[m][CPU_MAP_SOCKET_ID];
@@ -607,6 +614,8 @@ void parse_cache_info_linux(const std::vector> system_i
         _cpu_mapping_table.erase(_cpu_mapping_table.begin() + offline_list[n] - n);
         _processors--;
     }
+
+    _processors = _processors - _blocked_cores;
 };
 
 void get_cpu_mapping_from_cores(const int _processors,
diff --git a/src/inference/src/os/win/win_system_conf.cpp b/src/inference/src/os/win/win_system_conf.cpp
index 8786ac601e7cf9..31508c6efecce2 100644
--- a/src/inference/src/os/win/win_system_conf.cpp
+++ b/src/inference/src/os/win/win_system_conf.cpp
@@ -318,7 +318,7 @@ int get_number_of_cpu_cores(bool bigCoresOnly) {
         phys_cores++;
     } while (offset < sz);
 
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     auto core_types = custom::info::core_types();
     if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ {
         phys_cores = custom::info::default_concurrency(
@@ -328,7 +328,7 @@ int get_number_of_cpu_cores(bool bigCoresOnly) {
     return phys_cores;
 }
 
-#if !(OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if !OV_THREAD_USE_TBB
 // OMP/SEQ threading on the Windows doesn't support NUMA
 std::vector<int> get_available_numa_nodes() {
     return {-1};
diff --git a/src/inference/src/system_conf.cpp b/src/inference/src/system_conf.cpp
index 318fbf94c65510..dd47185947e381 100644
--- a/src/inference/src/system_conf.cpp
+++ b/src/inference/src/system_conf.cpp
@@ -291,7 +291,7 @@ CPU& cpu_info() {
 int get_number_of_cpu_cores(bool) {
     return parallel_get_max_threads();
 }
-#    if !((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#    if !((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 std::vector<int> get_available_numa_nodes() {
     return {-1};
 }
@@ -346,7 +346,7 @@ int get_org_numa_id(int numa_node_id) {
 int get_number_of_cpu_cores(bool) {
     return parallel_get_max_threads();
 }
-#    if !((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#    if !((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 std::vector<int> get_available_numa_nodes() {
     return {-1};
 }
@@ -420,7 +420,7 @@ int get_number_of_cpu_cores(bool bigCoresOnly) {
     OPENVINO_ASSERT(totalNumberOfCpuCores != 0, "Total number of cpu cores can not be 0.");
 
     int phys_cores = totalNumberOfCpuCores;
-#    if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#    if OV_THREAD_USE_TBB
     auto core_types = custom::info::core_types();
     if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ {
         phys_cores = custom::info::default_concurrency(
@@ -430,7 +430,7 @@ int get_number_of_cpu_cores(bool bigCoresOnly) {
     return phys_cores;
 }
 
-#    if !((OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO))
+#    if !((OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 std::vector<int> get_available_numa_nodes() {
     CPU& cpu = cpu_info();
     std::vector<int> nodes((0 == cpu._numa_nodes) ? 1 : cpu._numa_nodes);
@@ -558,7 +558,7 @@ void set_cpu_used(const std::vector<int>& cpu_ids, const int used) {
 int get_number_of_logical_cpu_cores(bool bigCoresOnly) {
     int logical_cores = parallel_get_max_threads();
-#    if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#    if OV_THREAD_USE_TBB
     auto core_types = custom::info::core_types();
     if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ {
         logical_cores = custom::info::default_concurrency(
@@ -592,7 +592,7 @@ int get_org_numa_id(int numa_node_id) {
 }
 #endif
 
-#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 std::vector<int> get_available_numa_nodes() {
     return custom::info::numa_nodes();
 }
diff --git a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp
index 9a200e20c0fa51..09e7854db98d1a 100644
--- a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp
+++ b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp
@@ -20,6 +20,7 @@ struct LinuxCpuMapTestCase {
     int _numa_nodes;
     int _sockets;
     int _cores;
+    int _blocked_cores;
     std::vector<std::vector<int>> _proc_type_table;
     std::vector<std::vector<int>> _cpu_mapping_table;
     std::vector<std::vector<std::string>> system_info_table;
@@ -36,6 +37,7 @@ class LinuxCpuMapCacheParserTests : public ov::test::TestsCommon,
     int test_numa_nodes = 0;
     int test_sockets = 0;
     int test_cores = 0;
+    int test_blocked_cores = 0;
     std::vector<std::vector<int>> test_proc_type_table;
     std::vector<std::vector<int>> test_cpu_mapping_table;
 
@@ -45,6 +47,7 @@ class LinuxCpuMapCacheParserTests : public ov::test::TestsCommon,
                                test_numa_nodes,
                                test_sockets,
                                test_cores,
+                               test_blocked_cores,
                                test_proc_type_table,
                                test_cpu_mapping_table);
 
@@ -52,6 +55,7 @@ class LinuxCpuMapCacheParserTests : public ov::test::TestsCommon,
         ASSERT_EQ(test_data._numa_nodes, test_numa_nodes);
         ASSERT_EQ(test_data._sockets, test_sockets);
         ASSERT_EQ(test_data._cores, test_cores);
+        ASSERT_EQ(test_data._blocked_cores, test_blocked_cores);
         ASSERT_EQ(test_data._proc_type_table, test_proc_type_table);
         ASSERT_EQ(test_data._cpu_mapping_table, test_cpu_mapping_table);
     }
@@ -82,6 +86,7 @@ LinuxCpuMapTestCase cache_2sockets_104cores_hyperthreading = {
    2,    // param[expected out]: total 2 numa nodes on this simulated platform
     2,    // param[expected out]: total 2 sockets on this simulated platform
     104,  // param[expected out]: total 104 CPU cores on this simulated platform
+    0,    // param[expected out]: total 0 blocked processors on this simulated platform
     {{208, 104, 0, 0, 104, -1, -1},
      {104, 52, 0, 0, 52, 0, 0},
      {104, 52, 0, 0, 52, 1, 1}},  // param[expected out]: The proc_type_table of this simulated platform
@@ -304,6 +309,7 @@ LinuxCpuMapTestCase cache_1sockets_96cores = {
     1,
     1,
     96,
+    0,
     {{96, 0, 96, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, EFFICIENT_CORE_PROC, 0, -1},  {1, 0, 0, 1, EFFICIENT_CORE_PROC, 0, -1},
@@ -390,6 +396,7 @@ LinuxCpuMapTestCase cache_2sockets_56cores_hyperthreading = {
     2,
     2,
     56,
+    0,
     {{110, 56, 0, 0, 54, -1, -1}, {54, 28, 0, 0, 26, 0, 0}, {56, 28, 0, 0, 28, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
@@ -572,6 +579,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores_hyperthreading = {
     2,
     2,
     48,
+    0,
     {{96, 48, 0, 0, 48, -1, -1}, {48, 24, 0, 0, 24, 0, 0}, {48, 24, 0, 0, 24, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
@@ -664,6 +672,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores_hyperthreading_1 = {
     4,
     2,
     48,
+    0,
     {{96, 48, 0, 0, 48, -1, -1},
      {24, 12, 0, 0, 12, 0, 0},
      {24, 12, 0, 0, 12, 1, 0},
@@ -760,6 +769,7 @@ LinuxCpuMapTestCase cache_2sockets_24cores_hyperthreading = {
     2,
     2,
     24,
+    0,
     {{48, 24, 0, 0, 24, -1, -1}, {24, 12, 0, 0, 12, 0, 0}, {24, 12, 0, 0, 12, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},   {1, 1, 1, 12, HYPER_THREADING_PROC, 12, -1},
@@ -845,6 +855,7 @@ LinuxCpuMapTestCase cache_2sockets_24cores_hyperthreading_1 = {
     4,
     2,
     24,
+    0,
     {{48, 24, 0, 0, 24, -1, -1},
      {12, 6, 0, 0, 6, 0, 0},
      {12, 6, 0, 0, 6, 1, 0},
@@ -936,6 +947,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores = {
     2,
     2,
     48,
+    0,
     {{48, 48, 0, 0, 0, -1, -1}, {24, 24, 0, 0, 0, 0, 0}, {24, 24, 0, 0, 0, 1, 1}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},   {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1},
@@ -988,6 +1000,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores_1 = {
     2,
     2,
     48,
+    0,
     {{48, 48, 0, 0, 0, -1, -1}, {24, 24, 0, 0, 0, 0, 0}, {24, 24, 0, 0, 0, 1, 1}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},   {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1},
@@ -1040,6 +1053,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores_2 = {
     4,
     2,
     48,
+    0,
     {{48, 48, 0, 0, 0, -1, -1},
      {12, 12, 0, 0, 0, 0, 0},
      {12, 12, 0, 0, 0, 1, 0},
@@ -1096,6 +1110,7 @@ LinuxCpuMapTestCase cache_2sockets_20cores_hyperthreading = {
     2,
     2,
     20,
+    0,
     {{40, 20, 0, 0, 20, -1, -1}, {20, 10, 0, 0, 10, 0, 0}, {20, 10, 0, 0, 10, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
@@ -1148,6 +1163,7 @@ LinuxCpuMapTestCase cache_2sockets_20cores_hyperthreading_1 = {
     2,
     2,
     20,
+    0,
     {{40, 20, 0, 0, 20, -1, -1}, {20, 10, 0, 0, 10, 0, 0}, {20, 10, 0, 0, 10, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
@@ -1200,18 +1216,31 @@ LinuxCpuMapTestCase cache_1sockets_16cores_hyperthreading = {
     1,
     1,
     14,
+    2,
     {{20, 6, 8, 0, 6, 0, 0}},
     {
-        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
-        {2, 0, 0, 1, MAIN_CORE_PROC, 1, -1},        {3, 0, 0, 2, HYPER_THREADING_PROC, 2, -1},
-        {4, 0, 0, 2, MAIN_CORE_PROC, 2, -1},        {5, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
-        {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1},  {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1},
-        {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1},  {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1},
-        {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1},
-        {12, 0, 0, 6, EFFICIENT_CORE_PROC, 6, -1},  {13, 0, 0, 7, EFFICIENT_CORE_PROC, 6, -1},
-        {14, 0, 0, 8, EFFICIENT_CORE_PROC, 6, -1},  {15, 0, 0, 9, EFFICIENT_CORE_PROC, 6, -1},
-        {16, 0, 0, 10, EFFICIENT_CORE_PROC, 7, -1}, {17, 0, 0, 11, EFFICIENT_CORE_PROC, 7, -1},
-        {18, 0, 0, 12, EFFICIENT_CORE_PROC, 7, -1}, {19, 0, 0, 13, EFFICIENT_CORE_PROC, 7, -1},
+        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
+        {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
+        {2, 0, 0, 1, MAIN_CORE_PROC, 1, -1},
+        {3, 0, 0, 2, HYPER_THREADING_PROC, 2, -1},
+        {4, 0, 0, 2, MAIN_CORE_PROC, 2, -1},
+        {5, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
+        {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1},
+        {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1},
+        {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1},
+        {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1},
+        {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1},
+        {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1},
+        {12, 0, 0, 6, EFFICIENT_CORE_PROC, 6, -1},
+        {13, 0, 0, 7, EFFICIENT_CORE_PROC, 6, -1},
+        {14, 0, 0, 8, EFFICIENT_CORE_PROC, 6, -1},
+        {15, 0, 0, 9, EFFICIENT_CORE_PROC, 6, -1},
+        {16, 0, 0, 10, EFFICIENT_CORE_PROC, 7, -1},
+        {17, 0, 0, 11, EFFICIENT_CORE_PROC, 7, -1},
+        {18, 0, 0, 12, EFFICIENT_CORE_PROC, 7, -1},
+        {19, 0, 0, 13, EFFICIENT_CORE_PROC, 7, -1},
+        {20, 0, 0, 14, LP_EFFICIENT_CORE_PROC, -100, -100},
+        {21, 0, 0, 14, LP_EFFICIENT_CORE_PROC, -100, -100},
     },
     {
         {"0,5", "0,5", "0-19"},   {"1-2", "1-2", "0-19"},   {"1-2", "1-2", "0-19"},   {"3-4", "3-4", "0-19"},
@@ -1230,6 +1259,7 @@ LinuxCpuMapTestCase cache_1sockets_16cores = {
     1,
     1,
     16,
+    0,
     {{16, 4, 8, 4, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1276,6 +1306,7 @@ LinuxCpuMapTestCase cache_1sockets_14cores_hyperthreading = {
     1,
     1,
     14,
+    0,
     {{20, 6, 8, 0, 6, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1305,6 +1336,7 @@ LinuxCpuMapTestCase cache_1sockets_14cores_hyperthreading_1 = {
     1,
     1,
     14,
+    0,
     {{20, 6, 8, 0, 6, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1334,6 +1366,7 @@ LinuxCpuMapTestCase cache_1sockets_14cores = {
     1,
     1,
     9,
+    0,
     {{9, 1, 8, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1369,6 +1402,7 @@ LinuxCpuMapTestCase cache_1sockets_14cores_2 = {
     1,
     1,
     8,
+    0,
     {{8, 0, 8, 0, 0, 0, 0}},
     {
         {6, 0, 0, 0, EFFICIENT_CORE_PROC, 0, -1},
@@ -1403,6 +1437,7 @@ LinuxCpuMapTestCase cache_1sockets_10cores_hyperthreading = {
     1,
     1,
     10,
+    0,
     {{12, 2, 8, 0, 2, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -1439,6 +1474,7 @@ LinuxCpuMapTestCase cache_1sockets_8cores_hyperthreading = {
     1,
     1,
     8,
+    0,
     {{12, 4, 4, 0, 4, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -1475,6 +1511,7 @@ LinuxCpuMapTestCase cache_1sockets_8cores = {
     1,
     1,
     8,
+    0,
     {{8, 4, 0, 4, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1505,6 +1542,7 @@ LinuxCpuMapTestCase cache_1sockets_8cores_1 = {
     1,
     1,
     8,
+    0,
     {{8, 8, 0, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1533,6 +1571,7 @@ LinuxCpuMapTestCase cache_1sockets_6cores_hyperthreading = {
     1,
     1,
     6,
+    0,
     {{12, 6, 0, 0, 6, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -1569,6 +1608,7 @@ LinuxCpuMapTestCase cache_1sockets_4cores = {
     1,
     1,
     4,
+    0,
     {{4, 4, 0, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1589,6 +1629,7 @@ LinuxCpuMapTestCase cache_1sockets_4cores_1 = {
     1,
     1,
     4,
+    0,
     {{4, 0, 4, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, EFFICIENT_CORE_PROC, 0, -1},
@@ -1609,6 +1650,7 @@ LinuxCpuMapTestCase cache_1sockets_4cores_2 = {
     1,
     1,
     4,
+    0,
     {{4, 4, 0, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1631,6 +1673,7 @@ LinuxCpuMapTestCase cache_VM_cache_0 = {
     0,
     0,
     0,
+    0,
     {},
     {},
     {
@@ -1653,6 +1696,7 @@ LinuxCpuMapTestCase cache_mock_0 = {
     1,
     1,
     8,
+    0,
     {{8, 0, 8, 0, 0, 0, 0}},
     {
         {6, 0, 0, 0, EFFICIENT_CORE_PROC, 0, -1},
@@ -1687,6 +1731,7 @@ LinuxCpuMapTestCase cache_mock_1 = {
     1,
     1,
     2,
+    0,
     {{2, 2, 0, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp
index 82188856c853d7..aa43ddc28b5676 100644
--- a/src/plugins/intel_cpu/src/compiled_model.cpp
+++ b/src/plugins/intel_cpu/src/compiled_model.cpp
@@ -15,6 +15,7 @@
 
 #include "async_infer_request.h"
 #include "config.h"
+#include "cpu_parallel.hpp"
 #include "graph.h"
 #include "graph_context.h"
 #include "infer_request.h"
@@ -198,10 +199,12 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const {
             std::lock_guard<std::mutex> lock{*m_mutex};
             auto isQuantizedFlag = (m_cfg.lpTransformsMode == Config::On) &&
                                    ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model);
+            auto cpuParallel = std::make_shared<CpuParallel>(m_cfg.tbbPartitioner);
             ctx = std::make_shared<GraphContext>(m_cfg,
                                                  m_socketWeights[socketId],
                                                  isQuantizedFlag,
                                                  streamsExecutor,
+                                                 cpuParallel,
                                                  m_sub_memory_manager);
         }
@@ -298,6 +301,7 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
                                                   RO_property(ov::log::level.name()),
                                                   RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
                                                   RO_property(ov::intel_cpu::enable_tensor_parallel.name()),
+                                                  RO_property(ov::intel_cpu::tbb_partitioner.name()),
                                                   RO_property(ov::hint::dynamic_quantization_group_size.name()),
                                                   RO_property(ov::hint::kv_cache_precision.name()),
                                                   RO_property(ov::key_cache_precision.name()),
@@ -380,6 +384,9 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
         const auto& enable_tensor_parallel = config.enableTensorParallel;
         return enable_tensor_parallel;
     }
+    if (name == ov::intel_cpu::tbb_partitioner) {
+        return config.tbbPartitioner;
+    }
     if (name == ov::hint::dynamic_quantization_group_size) {
         return static_cast<uint64_t>(config.fcDynamicQuantizationGroupSize);
     }
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index dce25d5f24f080..8de4159bff4773 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -192,6 +192,16 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                            ov::intel_cpu::sparse_weights_decompression_rate.name(),
                            ". Sparse rate must be in range [0.0f,1.0f]");
             fcSparseWeiDecompressionRate = val_f;
+        } else if (key == ov::intel_cpu::tbb_partitioner.name()) {
+            try {
+                tbbPartitioner = val.as<ov::intel_cpu::TbbPartitioner>();
+            } catch (ov::Exception&) {
+                OPENVINO_THROW("Wrong value ",
+                               val.as<std::string>(),
+                               " for property key ",
+                               ov::intel_cpu::tbb_partitioner.name(),
+                               ". Expected only ov::intel_cpu::TbbPartitioner::STATIC/AUTO");
+            }
         } else if (key == ov::hint::dynamic_quantization_group_size.name()) {
             try {
                 fcDynamicQuantizationGroupSizeSetExplicitly = true;
diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
index 68ea781a204c34..4b467c0dd0643f 100644
--- a/src/plugins/intel_cpu/src/config.h
+++ b/src/plugins/intel_cpu/src/config.h
@@ -16,6 +16,7 @@
 #include "openvino/core/any.hpp"
 #include "openvino/core/attribute_visitor.hpp"
 #include "openvino/core/type/element_type.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "openvino/runtime/threading/istreams_executor.hpp"
 #include "utils/debug_caps_config.h"
@@ -100,6 +101,7 @@ struct Config {
     bool changedCpuPinning = false;
     bool enableCpuReservation = false;
     ov::hint::SchedulingCoreType schedulingCoreType = ov::hint::SchedulingCoreType::ANY_CORE;
+    ov::intel_cpu::TbbPartitioner tbbPartitioner = ov::intel_cpu::TbbPartitioner::NONE;
     std::set<ov::hint::ModelDistributionPolicy> modelDistributionPolicy;
     bool enableTensorParallel = false;
     int streamsRankLevel = 1;
@@ -134,6 +136,8 @@ struct Config {
     std::map<std::string, std::string> _config;
 
     int modelPreferThreads = -1;
+    int modelPreferThreadsLatency = 0;
+    int modelPreferThreadsThroughput = 0;
     ModelType modelType = ModelType::Unknown;
     std::function cacheEncrypt;
     std::function cacheDecrypt;
diff --git a/src/plugins/intel_cpu/src/cpu_parallel.cpp b/src/plugins/intel_cpu/src/cpu_parallel.cpp
new file mode 100644
index 00000000000000..c63786d1eee57d
--- /dev/null
+++ b/src/plugins/intel_cpu/src/cpu_parallel.cpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "cpu_parallel.hpp"
+
+#include
+#include
+
+#include "openvino/runtime/intel_cpu/properties.hpp"
+#include "thread_pool_imp.hpp"
+
+namespace ov::intel_cpu {
+CpuParallel::CpuParallel(ov::intel_cpu::TbbPartitioner partitioner, size_t multiplier)
+    : m_partitioner(partitioner),
+      m_multiplier(multiplier) {
+    m_partitioner =
+        m_partitioner == ov::intel_cpu::TbbPartitioner::NONE ? ov::intel_cpu::TbbPartitioner::STATIC : m_partitioner;
+    m_thread_pool = std::make_shared<ThreadPool>(*this);
+}
+
+}  // namespace ov::intel_cpu
diff --git a/src/plugins/intel_cpu/src/cpu_parallel.hpp b/src/plugins/intel_cpu/src/cpu_parallel.hpp
new file mode 100644
index 00000000000000..d63434ff975204
--- /dev/null
+++ b/src/plugins/intel_cpu/src/cpu_parallel.hpp
@@ -0,0 +1,360 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+
+#include "openvino/core/parallel.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
+
+namespace ov::intel_cpu {
+class ThreadPool;
+
+class CpuParallel {
+public:
+    // Default multiplier for the number of virtual threads when the tbb partitioner is AUTO.
+    // This value was determined empirically.
+    static constexpr int default_multiplier = 32;
+
+    CpuParallel() = delete;
+    CpuParallel(CpuParallel&) = delete;
+    CpuParallel(ov::intel_cpu::TbbPartitioner partitioner = ov::intel_cpu::TbbPartitioner::STATIC,
+                size_t multiplier = default_multiplier);
+    ~CpuParallel() = default;
+
+    [[nodiscard]] ov::intel_cpu::TbbPartitioner get_partitioner() const {
+        return m_partitioner;
+    }
+    [[nodiscard]] size_t get_multiplier() const {
+        return m_multiplier;
+    }
+    [[nodiscard]] std::shared_ptr<ThreadPool> get_thread_pool() const {
+        return m_thread_pool;
+    }
+    [[nodiscard]] int get_num_threads() const {
+        int num = m_partitioner == ov::intel_cpu::TbbPartitioner::STATIC ? parallel_get_max_threads()
+                                                                         : parallel_get_max_threads() * m_multiplier;
+        return num;
+    }
+    void activate() const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        dnnl_threadpool_interop_set_max_concurrency(get_num_threads());
+#endif
+    }
+
+    template <typename T0, typename F>
+    void parallel_simple(const T0 D0, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        const auto nthr = D0;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            tbb::parallel_for(0, nthr, [&](int ithr) {
+                func(ithr, nthr);
+            });
+        } else {
+            tbb::parallel_for(
+                0,
+                nthr,
+                [&](int ithr) {
+                    func(ithr, nthr);
+                },
+                tbb::static_partitioner());
+        }
+#else
+        ov::parallel_for(D0, func);
+#endif
+    }
+
+    template <typename T0, typename R, typename F>
+    [[nodiscard]] R parallel_sum(const T0& D0, const R& input, const F& func) const {
+        return cpu_parallel_sum(D0, input, func);
+    }
+    template <typename T0, typename F>
+    void parallel_for(const T0& D0, const F& func) const {
+        cpu_parallel_for(D0, func);
+    }
+    template <typename T0, typename T1, typename F>
+    void parallel_for2d(const T0& D0, const T1& D1, const F& func) const {
+        cpu_parallel_for2d(D0, D1, func);
+    }
+    template <typename T0, typename T1, typename T2, typename F>
+    void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) const {
+        cpu_parallel_for3d(D0, D1, D2, func);
+    }
+    template <typename T0, typename T1, typename T2, typename T3, typename F>
+    void parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) const {
+        cpu_parallel_for4d(D0, D1, D2, D3, func);
+    }
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
+    void parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const T4& D4, const F& func) const {
+        cpu_parallel_for5d(D0, D1, D2, D3, D4, func);
+    }
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename F>
+    void parallel_for6d(const T0& D0,
+                        const T1& D1,
+                        const T2& D2,
+                        const T3& D3,
+                        const T4& D4,
+                        const T5& D5,
+                        const F& func) const {
+        cpu_parallel_for6d(D0, D1, D2, D3, D4, D5, func);
+    }
+
+private:
+    template <typename T0, typename R, typename F>
+    [[nodiscard]] R cpu_parallel_sum(const T0& D0, const R& input, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        R res_sum = 0;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            res_sum = _TBB_REDUCE_FUNC(
+                tbb::blocked_range<T0>(0, D0),
+                input,
+                [&](const tbb::blocked_range<T0>& r, R init) -> R {
+                    R sum = init;
+                    for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1) {
+                        sum += func(dim1);
+                    }
+                    return sum;
+                },
+                [](R x, R y) -> R {
+                    return x + y;
+                });
+        } else {
+            res_sum = _TBB_REDUCE_FUNC(
+                tbb::blocked_range<T0>(0, D0),
+                input,
+                [&](const tbb::blocked_range<T0>& r, R init) -> R {
+                    R sum = init;
+                    for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1) {
+                        sum += func(dim1);
+                    }
+                    return sum;
+                },
+                [](R x, R y) -> R {
+                    return x + y;
+                },
+                tbb::static_partitioner());
+        }
+        return res_sum;
+#else
+        return ov::parallel_sum(D0, input, func);
+#endif
+    }
+
+    template <typename T0, typename F>
+    void cpu_parallel_for(const T0& D0, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<size_t>(D0);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_1d(0, 1, D0, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_1d(ithr, virtual_threads, D0, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_1d(ithr, virtual_threads, D0, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for(D0, func);  // from core
+#endif
+    }
+
+    template <typename T0, typename T1, typename F>
+    void cpu_parallel_for2d(const T0& D0, const T1& D1, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<size_t>(D0 * D1);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_2d(0, 1, D0, D1, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_2d(ithr, virtual_threads, D0, D1, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_2d(ithr, virtual_threads, D0, D1, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for2d(D0, D1, func);
+#endif
+    }
+
+    template <typename T0, typename T1, typename T2, typename F>
+    void cpu_parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<size_t>(D0 * D1 * D2);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+
+    template <typename T0, typename T1, typename T2, typename F>
+    void cpu_parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<int>(D0 * D1 * D2);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_3d(0, 1, D0, D1, D2, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_3d(ithr, virtual_threads, D0, D1, D2, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_3d(ithr, virtual_threads, D0, D1, D2, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for3d(D0, D1, D2, func);
+#endif
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename F>
+    void cpu_parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<int>(D0 * D1 * D2 * D3);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_4d(0, 1, D0, D1, D2, D3, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_4d(ithr, virtual_threads, D0, D1, D2, D3, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_4d(ithr, virtual_threads, D0, D1, D2, D3, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for4d(D0, D1, D2, D3, func);
+#endif
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
+    void cpu_parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const T4& D4, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<int>(D0 * D1 * D2 * D3 * D4);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_5d(0, 1, D0, D1, D2, D3, D4, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_5d(ithr, virtual_threads, D0, D1, D2, D3, D4, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_5d(ithr, virtual_threads, D0, D1, D2, D3, D4, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for5d(D0, D1, D2, D3, D4, func);
+#endif
+    }
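+
+    // Note: the for_1d .. for_6d helpers used above come from
+    // openvino/core/parallel.hpp; each virtual thread `ithr` is handed one
+    // contiguous slice of the flattened D0 * ... * Dn iteration space and
+    // calls `func` once per index inside that slice.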
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename F>
+    void cpu_parallel_for6d(const T0& D0,
+                            const T1& D1,
+                            const T2& D2,
+                            const T3& D3,
+                            const T4& D4,
+                            const T5& D5,
+                            const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<int>(D0 * D1 * D2 * D3 * D4 * D5);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_6d(0, 1, D0, D1, D2, D3, D4, D5, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_6d(ithr, virtual_threads, D0, D1, D2, D3, D4, D5, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_6d(ithr, virtual_threads, D0, D1, D2, D3, D4, D5, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for6d(D0, D1, D2, D3, D4, D5, func);
+#endif
+    }
+
+    ov::intel_cpu::TbbPartitioner m_partitioner = ov::intel_cpu::TbbPartitioner::STATIC;
+    size_t m_multiplier = default_multiplier;
+    std::shared_ptr<ThreadPool> m_thread_pool = nullptr;
+};
+
+}  // namespace ov::intel_cpu
diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
index 6b14e38d2ae015..9825f4ce948f60 100644
--- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
+++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
@@ -19,6 +19,7 @@
 #include "openvino/core/any.hpp"
 #include "openvino/core/except.hpp"
 #include "openvino/core/model.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "openvino/runtime/system_conf.hpp"
@@ -34,7 +35,6 @@
 #include "openvino/op/fake_quantize.hpp"
 #include "openvino/runtime/threading/cpu_streams_info.hpp"
 #include "openvino/runtime/threading/istreams_executor.hpp"
-#include "transformations/utils.hpp"
 #include "transformations/utils/utils.hpp"
 #include "utils/general_utils.h"
@@ -46,7 +46,7 @@ constexpr int TP_CPU_LIMIT = 32;
 namespace ov::intel_cpu {
-void sort_table_by_numa_node_id(const int current_numa_node, std::vector<std::vector<int>>& proc_type_table) {
+void sort_table_by_numa_node_id(int current_numa_node, std::vector<std::vector<int>>& proc_type_table) {
     if (proc_type_table.size() > 1) {
         for (size_t i = 1; i < proc_type_table.size(); i++) {
             if (current_numa_node == proc_type_table[i][PROC_NUMA_NODE_ID]) {
@@ -608,14 +608,39 @@ int get_model_prefer_threads(const int num_streams,
                              const std::vector<std::vector<int>>& proc_type_table,
                              const std::shared_ptr<ov::Model>& model,
                              Config& config) {
+    bool int8_intensive = ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model);
+
+    auto default_prefer_threads_latency = [&]() {
+        bool llm_related = ov::op::util::is_large_language_model(*model);
+        const int int8_threshold = 4;  // ~relative efficiency of the VNNI-intensive code for Big vs Little cores;
+        const int fp32_threshold = 2;  // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores;
+        // By default the latency case uses (faster) Big cores only, depending on the compute ratio
+        // But on MTL detected by ov::get_number_of_blocked_cores(), use Big and Little cores together in Big
+        // cores only cases except LLM.
+        bool use_all_cores =
+            proc_type_table[0][MAIN_CORE_PROC] <= (proc_type_table[0][EFFICIENT_CORE_PROC] /
+                                                   (int8_intensive || llm_related ?
int8_threshold : fp32_threshold)); + bool use_big_and_little = !llm_related && (ov::get_number_of_blocked_cores() != 0); + + if (use_all_cores || use_big_and_little) { + config.modelPreferThreadsLatency = + proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]; + } else { + config.modelPreferThreadsLatency = proc_type_table[0][MAIN_CORE_PROC]; + } + return; + }; + const int sockets = get_num_sockets(); - auto model_prefer = 0; + if (-1 == config.modelPreferThreads) { + config.modelPreferThreads = 0; #if (defined(OPENVINO_ARCH_ARM64) && defined(__linux__)) - config.modelPreferThreads = 8; + config.modelPreferThreadsThroughput = 8; if (dnnl::impl::cpu::aarch64::mayiuse(dnnl::impl::cpu::aarch64::cpu_isa_t::sve_128)) { - config.modelPreferThreads = 16; + config.modelPreferThreadsThroughput = 16; } + default_prefer_threads_latency(); #else const auto isa = dnnl::get_effective_cpu_isa(); float isaSpecificThreshold = 1.0F; @@ -648,96 +673,180 @@ int get_model_prefer_threads(const int num_streams, config.inferencePrecision); # if (defined(OPENVINO_ARCH_ARM) && defined(__linux__)) - config.modelPreferThreads = 4; + config.modelPreferThreadsThroughput = 4; if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { if (networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) { - config.modelPreferThreads = 8; + config.modelPreferThreadsThroughput = 8; } } else if ((networkToleranceForLowCache.max_mem_tolerance < ov::MemBandwidthPressure::LIMITED) && ((networkToleranceForLowCache.ratio_mem_limited_deconvs > ov::MemBandwidthPressure::LIMITED) || (networkToleranceForLowCache.ratio_mem_limited_gemms > ov::MemBandwidthPressure::LIMITED))) { - config.modelPreferThreads = 8; + config.modelPreferThreadsThroughput = 8; + } + default_prefer_threads_latency(); + +# elif (defined(OPENVINO_ARCH_ARM) && defined(__APPLE__)) + if ((proc_type_table.size() == 1) && (proc_type_table[0][EFFICIENT_CORE_PROC] > 0)) { + config.modelPreferThreadsLatency = + proc_type_table[0][MAIN_CORE_PROC] > proc_type_table[0][EFFICIENT_CORE_PROC] + ? 
proc_type_table[0][MAIN_CORE_PROC] + : proc_type_table[0][ALL_PROC]; + } else { + default_prefer_threads_latency(); } -# elif ((defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)) - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) || (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) { // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams - config.modelPreferThreads = 4; + config.modelPreferThreadsThroughput = 4; } // otherwise (no recognized layers) falling back to the default value } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) { // network is below the ISA-specific threshold - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) { // network is below general threshold - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; } else if (networkToleranceForLowCache.ratio_mem_limited_deconvs > ov::MemBandwidthPressure::LIMITED && networkToleranceForLowCache.ratio_compute_convs < ov::MemBandwidthPressure::ALL) { - config.modelPreferThreads = 4; + config.modelPreferThreadsThroughput = 4; } else if (networkToleranceForLowCache.ratio_mem_limited_deconvs <= ov::MemBandwidthPressure::LIMITED && networkToleranceForLowCache.ratio_mem_limited_convs <= ov::MemBandwidthPressure::LIMITED && networkToleranceForLowCache.ratio_compute_convs > ov::MemBandwidthPressure::LIMITED) { - config.modelPreferThreads = 2; + config.modelPreferThreadsThroughput = 2; } # else - config.modelPreferThreads = 0; + if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && proc_type_table[0][MAIN_CORE_PROC] > 0) { + if ((proc_type_table[0][MAIN_CORE_PROC] < config.threads || config.threads == 0) && + (ov::get_number_of_blocked_cores() || proc_type_table[0][LP_EFFICIENT_CORE_PROC] > 0) && + proc_type_table[0][EFFICIENT_CORE_PROC] <= 2 * proc_type_table[0][MAIN_CORE_PROC]) { + if (ov::op::util::is_large_language_model(*model)) { + config.modelPreferThreadsLatency = proc_type_table[0][MAIN_CORE_PROC]; + } else { + config.modelPreferThreadsLatency = + proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]; + if (config.tbbPartitioner == TbbPartitioner::NONE) { + if (proc_type_table[0][LP_EFFICIENT_CORE_PROC] > 0 && int8_intensive && + networkToleranceForLowCache.total_convs > 0) { + bool main_core_case_1 = networkToleranceForLowCache.ratio_mem_limited_convs > 0.8F; + bool main_core_case_2 = networkToleranceForLowCache.ratio_mem_limited_convs == 0.0F && + networkToleranceForLowCache.ratio_compute_convs == 0.0F && + networkToleranceForLowCache.max_mem_tolerance >= 4.5F; + bool main_core_case_3 = + networkToleranceForLowCache.ratio_mem_limited_convs == 0.0F && + networkToleranceForLowCache.ratio_compute_convs > 0.0F && + networkToleranceForLowCache.ratio_compute_convs < 1.0F && + static_cast(networkToleranceForLowCache.total_light_convs) > + 0.9F * static_cast(networkToleranceForLowCache.total_convs); + bool main_core_case_4 = + networkToleranceForLowCache.ratio_mem_limited_convs > 0.0F && + networkToleranceForLowCache.ratio_compute_convs > 0.0F && + static_cast(networkToleranceForLowCache.total_light_convs) > + 0.46F * 
static_cast(networkToleranceForLowCache.total_convs); + if (main_core_case_1 || main_core_case_2 || main_core_case_3 || main_core_case_4) { + config.modelPreferThreadsLatency = proc_type_table[0][MAIN_CORE_PROC]; + config.tbbPartitioner = TbbPartitioner::STATIC; + } + } + if (config.tbbPartitioner == TbbPartitioner::NONE) { + bool static_case_1 = networkToleranceForLowCache.total_nodes == 0; + bool static_case_2 = networkToleranceForLowCache.total_convs > 0 && + static_cast(networkToleranceForLowCache.total_light_convs) > + 0.6F * static_cast(networkToleranceForLowCache.total_convs); + bool static_case_3 = false; + bool static_case_4 = false; + bool static_case_5 = false; + if (proc_type_table[0][LP_EFFICIENT_CORE_PROC] > 0) { + static_case_3 = + networkToleranceForLowCache.total_convs > 0 && + static_cast(networkToleranceForLowCache.total_light_convs) <= + 0.6F * static_cast(networkToleranceForLowCache.total_convs) && + networkToleranceForLowCache.ratio_compute_convs + + networkToleranceForLowCache.ratio_mem_limited_convs < + 0.9F && + networkToleranceForLowCache.ratio_mem_limited_convs < 0.2F && + networkToleranceForLowCache.ratio_mem_limited_gemms == 0.0F && + ((networkToleranceForLowCache.ratio_mem_limited_adds < 0.28F && + networkToleranceForLowCache.max_mem_tolerance >= 0.06F) || + networkToleranceForLowCache.ratio_compute_convs == 0 || + networkToleranceForLowCache.ratio_mem_limited_convs == 0); + static_case_4 = + networkToleranceForLowCache.total_convs == 0 && + (networkToleranceForLowCache.max_mem_tolerance > 2.5F || + static_cast(networkToleranceForLowCache.total_gemms) >= + 0.14F * static_cast(networkToleranceForLowCache.total_nodes)); + static_case_5 = + networkToleranceForLowCache.total_convs > 0 && + static_cast(networkToleranceForLowCache.total_light_convs) <= + 0.6F * static_cast(networkToleranceForLowCache.total_convs) && + networkToleranceForLowCache.ratio_compute_convs >= + 0.9F * networkToleranceForLowCache.ratio_mem_limited_convs && + networkToleranceForLowCache.ratio_compute_convs == 1.0F && + networkToleranceForLowCache.ratio_mem_limited_adds == 1.0F && + static_cast(networkToleranceForLowCache.total_heavy_convs) > + 0.1F * static_cast(networkToleranceForLowCache.total_nodes); + } else { + static_case_3 = + networkToleranceForLowCache.total_convs > 0 && + static_cast(networkToleranceForLowCache.total_light_convs) <= + 0.6F * static_cast(networkToleranceForLowCache.total_convs) && + networkToleranceForLowCache.ratio_compute_convs + + networkToleranceForLowCache.ratio_mem_limited_convs < + 0.9F && + networkToleranceForLowCache.ratio_mem_limited_convs < 0.2F && + networkToleranceForLowCache.ratio_mem_limited_gemms == 0.0F && + networkToleranceForLowCache.ratio_mem_limited_adds < 0.28F && + networkToleranceForLowCache.max_mem_tolerance >= 0.06F; + static_case_4 = networkToleranceForLowCache.total_convs == 0 && + static_cast(networkToleranceForLowCache.total_gemms) < + 0.05F * static_cast(networkToleranceForLowCache.total_nodes); + } + if (static_case_1 || static_case_2 || static_case_3 || static_case_4 || static_case_5) { + config.tbbPartitioner = TbbPartitioner::STATIC; + } else { + config.tbbPartitioner = TbbPartitioner::AUTO; + } + } + } + } + } else { + default_prefer_threads_latency(); + } + } else { + config.modelPreferThreadsLatency = proc_type_table[0][MAIN_CORE_PROC] > 0 + ? 
proc_type_table[0][MAIN_CORE_PROC] + : proc_type_table[0][EFFICIENT_CORE_PROC]; + } + config.modelPreferThreadsThroughput = 0; if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { if (any_of(ov::MemBandwidthPressure::ALL, networkToleranceForLowCache.ratio_compute_convs, networkToleranceForLowCache.ratio_compute_deconvs)) { // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; } // otherwise (no recognized layers) falling back to the default value } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) { // network is below the ISA-specific threshold - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) { // network is below general threshold - config.modelPreferThreads = 2; + config.modelPreferThreadsThroughput = 2; } - if (config.modelPreferThreads == 1 && proc_type_table[0][EFFICIENT_CORE_PROC] == 0 && + if (config.modelPreferThreadsThroughput == 1 && proc_type_table[0][EFFICIENT_CORE_PROC] == 0 && (proc_type_table[0][HYPER_THREADING_PROC] == proc_type_table[0][MAIN_CORE_PROC])) { - config.modelPreferThreads = 2; + config.modelPreferThreadsThroughput = 2; } # endif #endif } - // latency - if (num_streams <= sockets && num_streams > 0) { - if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && proc_type_table[0][MAIN_CORE_PROC] > 0) { -#ifdef __APPLE__ - if ((proc_type_table.size() == 1) && (proc_type_table[0][EFFICIENT_CORE_PROC] > 0)) { - model_prefer = proc_type_table[0][MAIN_CORE_PROC] > proc_type_table[0][EFFICIENT_CORE_PROC] - ? proc_type_table[0][MAIN_CORE_PROC] - : proc_type_table[0][ALL_PROC]; - } -#else - bool llm_related = has_matmul_with_compressed_weights(model); - bool int8_intensive = ov::op::util::has_op_with_type(model) || llm_related; - const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; - const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; - // By default the latency case uses (faster) Big cores only, depending on the compute ratio - // But on MTL detected by ov::get_number_of_blocked_cores(), use Big and Little cores together in Big - // cores only cases except LLM. - bool use_all_cores = - proc_type_table[0][MAIN_CORE_PROC] <= - (proc_type_table[0][EFFICIENT_CORE_PROC] / (int8_intensive ? int8_threshold : fp32_threshold)); - bool use_big_and_little = !llm_related && (ov::get_number_of_blocked_cores() != 0); - - if (use_all_cores || use_big_and_little) { - model_prefer = proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]; - } else { - model_prefer = proc_type_table[0][MAIN_CORE_PROC]; - } -#endif - } - } else { // throughput - model_prefer = config.modelPreferThreads; + if (num_streams > sockets || num_streams == 0) { + config.modelPreferThreads = config.modelPreferThreadsThroughput; + } else { + config.modelPreferThreads = config.modelPreferThreadsLatency; } - return model_prefer; + return config.modelPreferThreads; } std::vector> generate_stream_info(const int streams, @@ -760,7 +869,7 @@ std::vector> generate_stream_info(const int streams, } if (proc_type_table.size() > 1) { - const auto cur_numa_node_id = input_numa_node_id < 0 ? get_current_numa_node_id() : input_numa_node_id; + int cur_numa_node_id = input_numa_node_id < 0 ? 
get_current_numa_node_id() : input_numa_node_id; sort_table_by_numa_node_id(cur_numa_node_id, proc_type_table); } OPENVINO_ASSERT(!proc_type_table.empty() && proc_type_table[0][ALL_PROC] != 0, @@ -774,6 +883,8 @@ std::vector> generate_stream_info(const int streams, ov::util::to_string(config.hintPerfMode), config.modelDistributionPolicy, proc_type_table); + config.tbbPartitioner = + config.tbbPartitioner == TbbPartitioner::NONE ? TbbPartitioner::STATIC : config.tbbPartitioner; OPENVINO_ASSERT(!streams_info_table.empty(), "streams_info_table is empty!"); if (config.modelDistributionPolicy.find(ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL) != config.modelDistributionPolicy.end()) { diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp index 5c8a890eba740f..01f713a899fa25 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp @@ -117,5 +117,4 @@ void get_num_streams(int streams, const std::shared_ptr& model, Confi * @param[in] proc_type_table summary table of number of processors per type */ void sort_table_by_numa_node_id(int current_numa_node, std::vector>& proc_type_table); - } // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 8620ced8a5ae92..167a14d65decb2 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -69,17 +69,19 @@ #include "openvino/runtime/so_ptr.hpp" #include "perf_count.h" #include "proxy_mem_blk.h" +#include "thread_pool_imp.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" #include "utils/node_dumper.h" #include "utils/verbose.h" #include "weights_cache.hpp" -#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_OMP) +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE || \ + OV_THREAD == OV_THREAD_OMP) # include #endif -#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) +#if OV_THREAD_USE_TBB # include #endif @@ -121,7 +123,8 @@ void Graph::Init(const std::vector& graphNodes, } m_context = context; - m_stream = dnnl::stream(getEngine()); + m_stream = make_stream(getEngine(), m_context->getCpuParallel()->get_thread_pool()); + m_context->getCpuParallel()->activate(); this->_name = std::move(name); @@ -377,7 +380,8 @@ void Graph::Init(const std::shared_ptr& model, } m_context = context; - m_stream = dnnl::stream(getEngine()); + m_stream = make_stream(getEngine(), m_context->getCpuParallel()->get_thread_pool()); + m_context->getCpuParallel()->activate(); Replicate(model, inputConfigs, outputConfigs); @@ -1385,7 +1389,8 @@ class UpdateNodesSeq { using UpdateNodes = UpdateNodesSeq; #endif -#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_OMP) +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE || \ + OV_THREAD == OV_THREAD_OMP) class UpdateNodesBase { public: @@ -1432,7 +1437,7 @@ class UpdateNodesBase { }; // NOLINTBEGIN(misc-include-cleaner) tbb has multiple implicit includes, which are not supposed to be included directly -# if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) +# if OV_THREAD_USE_TBB # if (TBB_VERSION_MAJOR > 2020) template class AsyncTask : public tbb::detail::d1::task { diff --git a/src/plugins/intel_cpu/src/graph_context.cpp 
b/src/plugins/intel_cpu/src/graph_context.cpp
index 458f0dddf418be..60a74e816fcc5e 100644
--- a/src/plugins/intel_cpu/src/graph_context.cpp
+++ b/src/plugins/intel_cpu/src/graph_context.cpp
@@ -10,6 +10,7 @@
 #include "cache/multi_cache.h"
 #include "config.h"
+#include "cpu_parallel.hpp"
 #include "dnnl_scratch_pad.h"
 #include "memory_control.hpp"
 #include "nodes/memory.hpp"
@@ -25,6 +26,7 @@ GraphContext::GraphContext(Config config,
                            WeightsSharing::Ptr w_cache,
                            bool isGraphQuantized,
                            ov::threading::IStreamsExecutor::Ptr streamExecutor,
+                           std::shared_ptr<CpuParallel> cpuParallel,
                            std::shared_ptr<SubMemoryManager> sub_memory_manager)
     : m_config(std::move(config)),
       m_weightsCache(std::move(w_cache)),
@@ -32,6 +34,7 @@ GraphContext::GraphContext(Config config,
       m_snippetsParamsCache(std::make_shared<MultiCache>(m_config.snippetsCacheCapacity)),
       m_isGraphQuantizedFlag(isGraphQuantized),
       m_streamExecutor(std::move(streamExecutor)),
+      m_cpuParallel(std::move(cpuParallel)),
       m_subMemoryManager(std::move(sub_memory_manager)),
       m_memoryStatesRegister(std::make_shared<node::MemoryStatesRegister>()),
@@ -51,6 +54,10 @@ GraphContext::GraphContext(Config config,
     for (int i = 0; i < numaNum; i++) {
         m_rtScratchPads.push_back(std::make_shared<DnnlScratchPad>(getEngine(), i));
     }
+
+    if (!m_cpuParallel) {
+        m_cpuParallel = std::make_shared<CpuParallel>(m_config.tbbPartitioner);
+    }
 }
 const dnnl::engine& GraphContext::getEngine() {
diff --git a/src/plugins/intel_cpu/src/graph_context.h b/src/plugins/intel_cpu/src/graph_context.h
index 254762ca1c9ea7..656e25f810ea3c 100644
--- a/src/plugins/intel_cpu/src/graph_context.h
+++ b/src/plugins/intel_cpu/src/graph_context.h
@@ -10,6 +10,7 @@
 #include "cache/multi_cache.h"
 #include "config.h"
+#include "cpu_parallel.hpp"
 #include "dnnl_scratch_pad.h"
 #include "memory_control.hpp"
 #include "openvino/runtime/threading/cpu_streams_executor.hpp"
@@ -35,6 +36,7 @@ class GraphContext {
                  WeightsSharing::Ptr w_cache,
                  bool isGraphQuantized,
                  ov::threading::IStreamsExecutor::Ptr streamExecutor = nullptr,
+                 std::shared_ptr<CpuParallel> cpuParallel = nullptr,
                  std::shared_ptr<SubMemoryManager> sub_memory_manager = nullptr);
     [[nodiscard]] const Config& getConfig() const {
@@ -71,6 +73,10 @@ class GraphContext {
         return m_cpuStreamExecutor;
     }
+    [[nodiscard]] std::shared_ptr<CpuParallel> getCpuParallel() const {
+        return m_cpuParallel;
+    }
+
     [[nodiscard]] std::shared_ptr<SubMemoryManager> getSubMemory() const {
         return m_subMemoryManager;
     }
@@ -121,6 +127,7 @@ class GraphContext {
     ov::threading::IStreamsExecutor::Ptr m_streamExecutor;  // cpu stream executor for current graph
     ov::threading::CPUStreamsExecutor::Ptr m_cpuStreamExecutor;
+    std::shared_ptr<CpuParallel> m_cpuParallel = nullptr;
     // numa submemory manager
     std::shared_ptr<SubMemoryManager> m_subMemoryManager;
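Every node-level change that follows repeats one pattern: fetch the graph's CpuParallel from the GraphContext introduced above and route loops through it instead of the free ov::parallel_* helpers. A minimal sketch of that pattern (the dimensions B, C and the kernel callable are hypothetical):

    const auto& cpu_parallel = context->getCpuParallel();
    cpu_parallel->parallel_for2d(B, C, [&](size_t b, size_t c) {
        kernel(b, c);  // per-(b, c) work item
    });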
diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp
index 680444a7d74554..86a71ff4b6a941 100644
--- a/src/plugins/intel_cpu/src/node.cpp
+++ b/src/plugins/intel_cpu/src/node.cpp
@@ -811,6 +811,7 @@ void Node::updateDynamicParams() {
                     getName(),
                     " ",
                     getOriginalLayers());
+        context->getCpuParallel()->activate();
         prepareParams();
     }
 }
@@ -1116,7 +1117,10 @@ void Node::prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx) {
         Memory memory{engine, newDesc, internalBlob->getData()};
         MemoryPtr _ptr = std::make_shared<Memory>(engine, intDesc);
-        node::Reorder::reorderData(memory, *_ptr, context->getParamsCache());
+        node::Reorder::reorderData(memory,
+                                   *_ptr,
+                                   context->getParamsCache(),
+                                   context->getCpuParallel()->get_thread_pool());
         return _ptr;
     };
@@ -1150,7 +1154,10 @@ MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc, DnnlMemoryD
     auto create = [&]() {
         Memory srcMemory{getEngine(), srcWeightDesc, edgeMem->getData()};
         MemoryPtr _ptr = std::make_shared<Memory>(getEngine(), dstWeightDesc);
-        node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache());
+        node::Reorder::reorderData(srcMemory,
+                                   *_ptr,
+                                   context->getParamsCache(),
+                                   context->getCpuParallel()->get_thread_pool());
         return _ptr;
     };
diff --git a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp
index 0d30084da1586a..a8699c1ab3138e 100644
--- a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp
+++ b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp
@@ -25,7 +25,6 @@
 #include "onednn/iml_type_mapper.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/op/adaptive_avg_pool.hpp"
@@ -139,6 +138,7 @@ void AdaptivePooling::executeDynamicImpl(const dnnl::stream& strm) {
 }
 void AdaptivePooling::execute([[maybe_unused]] const dnnl::stream& strm) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto inputPrec = getParentEdgeAt(0)->getMemory().getDataType();
     auto outputPrec = getChildEdgeAt(0)->getMemory().getDataType();
     CPU_NODE_ASSERT(inputPrec == dnnl_f32 && outputPrec == dnnl_f32, "doesn't support demanded precisions");
@@ -264,7 +264,7 @@ void AdaptivePooling::execute([[maybe_unused]] const dnnl::stream& strm) {
         pool = poolAvg;
     }
-    parallel_for5d(N, blockCount, OD, OH, OW, [&](int n, int blkIdx, int od, int oh, int ow) {
+    cpu_parallel->parallel_for5d(N, blockCount, OD, OH, OW, [&](int n, int blkIdx, int od, int oh, int ow) {
         const auto* srcData = src + n * inStrides[0] + blkIdx * inStrides[1];
         auto* dstData = dst + n * outStrides[0] + blkIdx * outStrides[1] + od * outStrides[2] + oh * outStrides[3] +
                         ow * outStrides[4];
diff --git a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp
index 84f20afce5b105..100724d5aa86d5 100644
--- a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp
@@ -33,7 +33,6 @@
 #include "openvino/core/enum_names.hpp"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/op/binary_convolution.hpp"
@@ -1270,6 +1269,7 @@ void BinaryConvolution::executeOptimized(const uint8_t* src,
                                          const std::vector<size_t>& s_str,
                                          const std::vector<size_t>& w_str,
                                          const std::vector<size_t>& d_str) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto* dst_f32 = reinterpret_cast<float*>(dst);
     const int MB = jcp.mb;
@@ -1277,7 +1277,7 @@ void BinaryConvolution::executeOptimized(const uint8_t* src,
     int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking);
     int nbits = 8;
-    parallel_for4d(MB, jcp.ngroups, ocb_work, jcp.oh, [&](int n, int g, int ocbb, int oh) {
+    cpu_parallel->parallel_for4d(MB, jcp.ngroups, ocb_work, jcp.oh, [&](int n, int g, int ocbb, int oh) {
         int ocb = ocbb * jcp.nb_oc_blocking;
         int ocb_num = jcp.nb_oc_blocking;
@@ -1326,6 +1326,7 @@ void BinaryConvolution::executeReference(const uint8_t* src,
                                          const std::vector<size_t>& s_str,
                                          const std::vector<size_t>& w_str,
                                          const std::vector<size_t>& d_str) const {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto* dst_fp = reinterpret_cast<float*>(dst);
     const bool with_groups = jcp.ngroups > 1;
@@ -1393,7 +1394,7 @@ void BinaryConvolution::executeReference(const uint8_t* src,
         }
     };
-    parallel_for5d(G, MB, OC, OH, OW,
[&](int g, int mb, int oc, int oh, int ow) { + cpu_parallel->parallel_for5d(G, MB, OC, OH, OW, [&](int g, int mb, int oc, int oh, int ow) { int32_t a = 0; ker(a, g, mb, oc, oh, ow); diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.cpp b/src/plugins/intel_cpu/src/nodes/bucketize.cpp index 77e5b68809c732..1b7acd45d72ade 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.cpp +++ b/src/plugins/intel_cpu/src/nodes/bucketize.cpp @@ -22,7 +22,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/core/type/element_type_traits.hpp" @@ -232,6 +231,7 @@ void Bucketize::bucketize() { const auto* input_data = getSrcDataAtPortAs(0); const auto* boundaries_data = getSrcDataAtPortAs(1); auto* output_data = getDstDataAtPortAs(0); + const auto& cpu_parallel = context->getCpuParallel(); if (!with_bins) { memset(output_data, 0, num_values * sizeof(T_IND)); @@ -239,7 +239,7 @@ void Bucketize::bucketize() { } // boundaries are assumed to be sorted and to have unique elements - parallel_for(num_values, [&](size_t ind) { + cpu_parallel->parallel_for(num_values, [&](size_t ind) { T value = input_data[ind]; if (with_right) { const auto* low = std::lower_bound(boundaries_data, boundaries_data + num_bin_values, value); diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 4c2b6320a01bc9..15826f3d7a2ab7 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -605,6 +605,7 @@ void Concat::exec1DCase() { } void Concat::execNspcSpecCase() { + const auto& cpu_parallel = context->getCpuParallel(); const auto& dst_memory = getChildEdgeAt(0)->getMemory(); const size_t num_src = getParentEdges().size(); auto* dst_ptr = dst_memory.getDataAs(); @@ -638,7 +639,7 @@ void Concat::execNspcSpecCase() { const Shape& shape = getSrcMemoryAtPort(firstNonZeroEdge)->getShape(); const size_t iter_count = shape.getElementsCount() / shape.getStaticDims()[channelAxis]; - parallel_for(iter_count, [&](int i) { + cpu_parallel->parallel_for(iter_count, [&](int i) { const size_t dst_off = i * channels_size; for (size_t j = 0; j < nonZeroInShapes; j++) { cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channelsDataSize[j], channelsDataSize[j]); @@ -647,6 +648,7 @@ void Concat::execNspcSpecCase() { } void Concat::execRef() { + const auto& cpu_parallel = context->getCpuParallel(); const size_t numSrc = getParentEdges().size(); const auto& dstMemory = getChildEdgeAt(0)->getMemory(); auto* dstPtr = dstMemory.getDataAs(); @@ -695,65 +697,65 @@ void Concat::execRef() { } const auto L1Size = dnnl::utils::get_cache_size(1, true); UNUSED(L1Size); // for Windows - parallel_for6d(physDims[0], - physDims[1], - physDims[2], - physDims[3], - physDims[4], - numSrc, - [&](size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, size_t a) { - // check if zero memory - if (srcPtrs[a] == nullptr) { - return; - } - - size_t inOff = inputStrides[a][0] * n0 + inputStrides[a][1] * n1 + inputStrides[a][2] * n2 + - inputStrides[a][3] * n3 + inputStrides[a][4] * n4; - size_t outOff = outputStrides[0] * n0 + outputStrides[1] * n1 + outputStrides[2] * n2 + - outputStrides[3] * n3 + outputStrides[4] * n4; - const uint8_t* i = &srcPtrs[a][inOff]; - uint8_t* o = &dstPtr[dstOffset[a] + outOff]; + cpu_parallel->parallel_for6d( + physDims[0], + physDims[1], + physDims[2], + 
physDims[3], + physDims[4], + numSrc, + [&](size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, size_t a) { + // check if zero memory + if (srcPtrs[a] == nullptr) { + return; + } + + size_t inOff = inputStrides[a][0] * n0 + inputStrides[a][1] * n1 + inputStrides[a][2] * n2 + + inputStrides[a][3] * n3 + inputStrides[a][4] * n4; + size_t outOff = outputStrides[0] * n0 + outputStrides[1] * n1 + outputStrides[2] * n2 + + outputStrides[3] * n3 + outputStrides[4] * n4; + const uint8_t* i = &srcPtrs[a][inOff]; + uint8_t* o = &dstPtr[dstOffset[a] + outOff]; #if defined(__GNUC__) - // Heuristic: - // memcpy works generally faster for data sizes not - // exceeding L1 cache. - if (nelemToCopy[a] > L1Size) { - // The code below performs data copying: o[e] = i[e] - // and uses a workaround to make GNU compilers optimize it - uint8_t* ptro = o; - const uint8_t* ptri = i; - // head part: bytes before 4 byte-align's address - const size_t headPart = - sizeof(uint32_t) - reinterpret_cast(ptro) % sizeof(uint32_t); - - // main part: bytes in 4 byte-align - const size_t mainPart = (nelemToCopy[a] - headPart) / sizeof(uint32_t); - // tail part: bytes after 4 byte-align - const size_t tailPart = (nelemToCopy[a]) - headPart - (mainPart * sizeof(uint32_t)); - // copy head part - for (size_t e = 0; e < headPart; ++e) { - *ptro = *ptri; - ++ptro; - ++ptri; - } - // copy main part - std::memcpy(ptro, ptri, mainPart * sizeof(uint32_t)); - ptro += mainPart * sizeof(uint32_t); - ptri += mainPart * sizeof(uint32_t); - // copy tail part - for (size_t e = 0; e < tailPart; ++e) { - *ptro = *ptri; - ++ptro; - ++ptri; - } - } else { - std::memcpy(o, i, nelemToCopy[a]); - } + // Heuristic: + // memcpy works generally faster for data sizes not + // exceeding L1 cache. + if (nelemToCopy[a] > L1Size) { + // The code below performs data copying: o[e] = i[e] + // and uses a workaround to make GNU compilers optimize it + uint8_t* ptro = o; + const uint8_t* ptri = i; + // head part: bytes before 4 byte-align's address + const size_t headPart = sizeof(uint32_t) - reinterpret_cast(ptro) % sizeof(uint32_t); + + // main part: bytes in 4 byte-align + const size_t mainPart = (nelemToCopy[a] - headPart) / sizeof(uint32_t); + // tail part: bytes after 4 byte-align + const size_t tailPart = (nelemToCopy[a]) - headPart - (mainPart * sizeof(uint32_t)); + // copy head part + for (size_t e = 0; e < headPart; ++e) { + *ptro = *ptri; + ++ptro; + ++ptri; + } + // copy main part + std::memcpy(ptro, ptri, mainPart * sizeof(uint32_t)); + ptro += mainPart * sizeof(uint32_t); + ptri += mainPart * sizeof(uint32_t); + // copy tail part + for (size_t e = 0; e < tailPart; ++e) { + *ptro = *ptri; + ++ptro; + ++ptri; + } + } else { + std::memcpy(o, i, nelemToCopy[a]); + } #else - std::memcpy(o, i, nelemToCopy[a]); + std::memcpy(o, i, nelemToCopy[a]); #endif - }); + }); } } diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp index 44103f5e115a87..198ec287473104 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp @@ -90,6 +90,7 @@ void CTCGreedyDecoder::execute([[maybe_unused]] const dnnl::stream& strm) { const auto* probabilities = getSrcDataAtPortAs(DATA_INDEX); const auto* sequenceMask = getSrcDataAtPortAs(SEQUENCE_LENGTH_INDEX); auto* outputSequences = getDstDataAtPortAs(0); + const auto& cpu_parallel = context->getCpuParallel(); const size_t T = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[0]; 
const size_t B = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[1]; @@ -100,7 +101,7 @@ void CTCGreedyDecoder::execute([[maybe_unused]] const dnnl::stream& strm) { const int blankIndex = C - 1; std::vector sequenceLengths(B, 0); - parallel_for(B, [&](size_t b) { + cpu_parallel->parallel_for(B, [&](size_t b) { size_t t = 0; for (; t < T; t++) { if (sequenceMask[B * t + b] == 0.F) { @@ -168,7 +169,7 @@ void CTCGreedyDecoder::execute([[maybe_unused]] const dnnl::stream& strm) { parallel_nt(0, threadBody); - parallel_for(B, [&](size_t b) { + cpu_parallel->parallel_for(B, [&](size_t b) { float prevClassIdx = -1.0F; size_t outputIndex = b * T; const size_t sequenceLength = sequenceLengths[b]; diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp index 34b60583f45226..2085e20c5ac919 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp @@ -95,6 +95,7 @@ void CTCGreedyDecoderSeqLen::execute([[maybe_unused]] const dnnl::stream& strm) const auto* sequenceLengths = getSrcDataAtPortAs(SEQUENCE_LENGTH_INDEX); auto* decodedClasses = getDstDataAtPortAs(DECODED_CLASSES_INDEX); auto* decodedClassesLength = getDstDataAtPortAs(DECODED_CLASSES_LENGTH_INDEX); + const auto& cpu_parallel = context->getCpuParallel(); const size_t B = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[0]; ; @@ -169,7 +170,7 @@ void CTCGreedyDecoderSeqLen::execute([[maybe_unused]] const dnnl::stream& strm) parallel_nt(0, threadBody); - parallel_for(B, [&](size_t b) { + cpu_parallel->parallel_for(B, [&](size_t b) { int prevClassIdx = -1; size_t outputIndex = b * T; const size_t actualSeqLen = sequenceLengths[b]; diff --git a/src/plugins/intel_cpu/src/nodes/def_conv.cpp b/src/plugins/intel_cpu/src/nodes/def_conv.cpp index 1036d8b310d666..4757c0ea6c80bd 100644 --- a/src/plugins/intel_cpu/src/nodes/def_conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/def_conv.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -1051,7 +1050,7 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights(const float* } }; - parallel_nd(MB, DG, OH, OW, [&](dim_t mb, dim_t dg, dim_t oh, dim_t ow) { + parallel_for4d(MB, DG, OH, OW, [&](dim_t mb, dim_t dg, dim_t oh, dim_t ow) { precompKer(static_cast(mb), static_cast(dg), static_cast(oh), static_cast(ow)); }); } @@ -1133,7 +1132,7 @@ DeformableConvolution::DefConvExecutor::DefConvExecutor( jcp.ur_w = mayiuse(cpu::x64::avx512_core) ? 6 : 3; jcp.nb_oc_blocking = !mayiuse(cpu::x64::avx2) ? 
2 : 4; - jcp.nthr = dnnl_get_max_threads(); + jcp.nthr = parallel_get_max_threads(); } DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor( @@ -1230,7 +1229,7 @@ void DeformableConvolution::DefConvRefExecutor::exec(const float* src, return d; }; - parallel_nd(G, MB, OC, OH, OW, [&](dnnl_dim_t g, dnnl_dim_t mb, dnnl_dim_t oc, dnnl_dim_t oh, dnnl_dim_t ow) { + parallel_for5d(G, MB, OC, OH, OW, [&](dnnl_dim_t g, dnnl_dim_t mb, dnnl_dim_t oc, dnnl_dim_t oh, dnnl_dim_t ow) { dst[mb * dstStrides[0] + (g * OC + oc) * dstStrides[1] + oh * dstStrides[2] + ow * dstStrides[3]] = compKer(static_cast(g), static_cast(mb), diff --git a/src/plugins/intel_cpu/src/nodes/detection_output.cpp b/src/plugins/intel_cpu/src/nodes/detection_output.cpp index 8eedfe8ae2241a..a743df1cc8494f 100644 --- a/src/plugins/intel_cpu/src/nodes/detection_output.cpp +++ b/src/plugins/intel_cpu/src/nodes/detection_output.cpp @@ -25,7 +25,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "shape_inference/shape_inference_cpu.hpp" @@ -197,6 +196,7 @@ void DetectionOutput::execute([[maybe_unused]] const dnnl::stream& strm) { const auto* priorData = getSrcDataAtPortAs(ID_PRIOR); const float* ARMConfData = inputShapes.size() > 3 ? getSrcDataAtPortAs(ID_ARM_CONF) : nullptr; const float* ARMLocData = inputShapes.size() > 4 ? getSrcDataAtPortAs(ID_ARM_LOC) : nullptr; + const auto& cpu_parallel = context->getCpuParallel(); float* reorderedConfData = reorderedConf.data(); auto* reorderedConfDataIndices = reinterpret_cast(reorderedConf.data()); @@ -356,7 +356,7 @@ void DetectionOutput::execute([[maybe_unused]] const dnnl::stream& strm) { for (int n = 0; n < imgNum; ++n) { if (!decreaseClassId) { // Caffe style - parallel_for(classesNum, [&](int c) { + cpu_parallel->parallel_for(classesNum, [&](int c) { if (c != backgroundClassId) { // Ignore background class const int off = n * priorsNum * classesNum + c * priorsNum; const float* pconfReorder = reorderedConfData + off; @@ -401,7 +401,7 @@ void DetectionOutput::execute([[maybe_unused]] const dnnl::stream& strm) { } int detectionsTotal = 0; - detectionsTotal = parallel_sum(classesNum, detectionsTotal, [&](size_t c) -> int { + detectionsTotal = cpu_parallel->parallel_sum(classesNum, detectionsTotal, [&](size_t c) -> int { return detectionsData[n * classesNum + c]; }); @@ -410,7 +410,7 @@ void DetectionOutput::execute([[maybe_unused]] const dnnl::stream& strm) { std::vector>> confIndicesClassMap; std::mutex mtx; - parallel_for(classesNum, [&](int c) { + cpu_parallel->parallel_for(classesNum, [&](int c) { const int detections = detectionsData[n * classesNum + c]; int* pindices = indicesData + n * classesNum * priorsNum + c * priorsNum; @@ -478,7 +478,8 @@ inline void DetectionOutput::confFilterMX(const float* confData, int* detectionsData, const int& n) { std::mutex mtx; - parallel_for(numPriorsActual[n], [&](size_t p) { + const auto& cpu_parallel = context->getCpuParallel(); + cpu_parallel->parallel_for(numPriorsActual[n], [&](size_t p) { // in: origin conf // out: pindices, detectionCount // intentionally code branch from higher level @@ -553,8 +554,9 @@ inline void DetectionOutput::getActualPriorNum(const float* priorData, int* numP inline void DetectionOutput::confReorderDense(const float* confData, const float* ARMConfData, float* reorderedConfData) const { + const auto& cpu_parallel = 
context->getCpuParallel(); if (withAddBoxPred) { - parallel_for2d(imgNum, priorsNum, [&](size_t n, size_t p) { + cpu_parallel->parallel_for2d(imgNum, priorsNum, [&](size_t n, size_t p) { if (ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore) { for (int c = 0; c < classesNum; ++c) { reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = @@ -570,7 +572,7 @@ inline void DetectionOutput::confReorderDense(const float* confData, return; } // withAddBoxPred is false - parallel_for2d(imgNum, classesNum, [&](size_t n, size_t c) { + cpu_parallel->parallel_for2d(imgNum, classesNum, [&](size_t n, size_t c) { const int offset = n * priorsNum * classesNum; for (int p = 0; p < priorsNum; ++p) { reorderedConfData[offset + c * priorsNum + p] = confData[offset + p * classesNum + c]; @@ -584,6 +586,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat [[maybe_unused]] int* indicesData, int* indicesBufData, int* detectionsData) { + const auto& cpu_parallel = context->getCpuParallel(); auto* reorderedConfDataIndices = reinterpret_cast(reorderedConfData); for (int n = 0; n < imgNum; ++n) { const int off = n * priorsNum * classesNum; @@ -591,13 +594,13 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat const int offH = n * confInfoLen * classesNum; // horizontal info // reset count - parallel_for(classesNum, [&](size_t c) { + cpu_parallel->parallel_for(classesNum, [&](size_t c) { const int countIdx = offH + c * confInfoLen + priorsNum; reorderedConfDataIndices[countIdx] = 0; }); std::mutex mtx; - parallel_for(numPriorsActual[n], [&](size_t p) { + cpu_parallel->parallel_for(numPriorsActual[n], [&](size_t p) { // intentionally code branch from higher level if (withAddBoxPred) { const bool isARMPrior = ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore; @@ -649,7 +652,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat } }); // topk - parallel_for(classesNum, [&](size_t c) { + cpu_parallel->parallel_for(classesNum, [&](size_t c) { // in: conf_h info // out: buffer, detectionCount(k) if (c == static_cast(backgroundClassId)) { // Ignore background class @@ -675,12 +678,13 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat int* indicesData, int* indicesBufData, int* detectionsData) { + const auto& cpu_parallel = context->getCpuParallel(); for (int n = 0; n < imgNum; ++n) { const int off = n * priorsNum * classesNum; const int offV = n * priorsNum; // vertical info std::mutex mtx; - parallel_for(numPriorsActual[n], [&](size_t p) { + cpu_parallel->parallel_for(numPriorsActual[n], [&](size_t p) { bool isARMPrior = false; if (withAddBoxPred) { isARMPrior = ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore; @@ -751,13 +755,14 @@ inline void DetectionOutput::decodeBBoxes(const float* priorData, const int* confInfoH, const int* confInfoV) const { int prNum = numPriorsActual[n]; + const auto& cpu_parallel = context->getCpuParallel(); if (!decodeType) { prNum = priorsNum; } if (isSparsityWorthwhile && !isShareLoc && !decreaseClassId && confInfoH[priorsNum] == 0) { return; } - parallel_for(prNum, [&](int p) { + cpu_parallel->parallel_for(prNum, [&](int p) { if (isSparsityWorthwhile && isShareLoc && confInfoV[p] == -1) { return; } diff --git a/src/plugins/intel_cpu/src/nodes/dft.cpp b/src/plugins/intel_cpu/src/nodes/dft.cpp index 22cf03f072b072..f772277a49f771 100644 --- a/src/plugins/intel_cpu/src/nodes/dft.cpp +++ b/src/plugins/intel_cpu/src/nodes/dft.cpp @@ -322,6 +322,7 
@@ void DFT::dftNd(float* output, bool inverse) const { const std::vector iterationRange(outputShape.begin(), outputShape.end() - 1); const size_t lastDimIndex = iterationRange.size() - 1; + const auto& cpu_parallel = context->getCpuParallel(); for (size_t currentAxis : axes) { const size_t outputComplexLen = outputShape[currentAxis]; const size_t outputLen = outputComplexLen * 2; @@ -330,7 +331,7 @@ void DFT::dftNd(float* output, if (IsPowerOfTwo(outputComplexLen)) { size_t parallelDimIndex = lastDimIndex == currentAxis ? lastDimIndex - 1 : lastDimIndex; do { - parallel_for(iterationRange[parallelDimIndex], [&](size_t dim) { + cpu_parallel->parallel_for(iterationRange[parallelDimIndex], [&](size_t dim) { std::vector gatheredData(outputLen * 2); auto parallelIterationCounter = iterationCounter; parallelIterationCounter[parallelDimIndex] = dim; @@ -377,6 +378,7 @@ void DFT::fft(float* inBuffer, static int cacheSizeL3 = dnnl::utils::get_cache_size(3, false); static int elementsPerCacheLine = cacheSizeL3 / sizeof(float); size_t nComplex = dataLength / 2; + const auto& cpu_parallel = context->getCpuParallel(); std::function blockIteration; if (fftKernel != nullptr) { @@ -428,7 +430,7 @@ void DFT::fft(float* inBuffer, blockSize = nextIterationBlockSize; nextIterationBlockSize /= 2; if (parallelize && blockSize >= 4 * static_cast(elementsPerCacheLine)) { - parallel_for(numBlocks, [&](const size_t block) { + cpu_parallel->parallel_for(numBlocks, [&](const size_t block) { blockIteration(block, 1, nextIterationBlockSize); }); } else { @@ -455,6 +457,7 @@ void DFT::naiveDFT(float* data, size_t dataLength, bool inverse) const { CPU_NODE_THROW("Twiddles for nComplex=", nComplex, " not found"); } const auto& twiddles = twiddlesIter->second; + const auto& cpu_parallel = context->getCpuParallel(); std::function blockIteration; if (dftKernel != nullptr) { @@ -500,7 +503,7 @@ void DFT::naiveDFT(float* data, size_t dataLength, bool inverse) const { }; } - parallel_for(nComplex, blockIteration); + cpu_parallel->parallel_for(nComplex, blockIteration); cpu_memcpy(data, outputBuffer.data(), dataLength * sizeof(float)); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp index 246b5db1a7476d..b7184de8771ddf 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp @@ -49,6 +49,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" #include "post_ops.hpp" +#include "thread_pool_imp.hpp" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" @@ -151,7 +152,7 @@ std::optional acl_fc_executor::reorderDataFallback(const MemoryPtr& i auto convertOutput = *convertOutputOpt; if (reorderWithoutConvert) { - dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order); + dnnl::stream loc_stream = make_stream(output->getPrimitive().get_engine(), context->getThreadPool()); reorderWithoutConvert.execute( loc_stream, {{DNNL_ARG_FROM, convertOutput->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); @@ -198,7 +199,7 @@ MemoryPtr acl_fc_executor::reorderData(const DnnlMemoryDescPtr& srcWeightDesc, } // if precision conversion does not work then do direct reference reorder if (directReorder) { - dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order); + dnnl::stream loc_stream = make_stream(engine, 
context->getThreadPool()); directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); } else { diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp index 4628a6e7686faa..3fafe60c8f6c2d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp @@ -47,6 +47,7 @@ #include "openvino/core/type/element_type.hpp" #include "post_ops.hpp" #include "shape_inference/custom/convolution.hpp" +#include "thread_pool_imp.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" @@ -864,6 +865,7 @@ std::shared_ptr DnnlConvolutionPrimitive::create( auto builder = [&context, defaultImplType](const Key& dnnlKey) { return std::make_shared(dnnlKey, context->getEngine(), + context->getThreadPool(), context->getImplPriorities(), defaultImplType); }; @@ -1013,9 +1015,10 @@ bool DnnlConvolutionPrimitive::isNspcAvailable(const ConvConfig& config) { DnnlConvolutionPrimitive::DnnlConvolutionPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities, const impl_desc_type defaultImplType) - : m_stream(dnnl::stream(engine)), + : m_stream(make_stream(engine, threadPool)), m_primDesc(createPrimitiveDesc(key.src->getDnnlDesc(), key.wei->getDnnlDesc(), key.bias->getDnnlDesc(), diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.hpp index 2f80c0c8460994..3a002cdca6a6aa 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.hpp @@ -20,6 +20,7 @@ #include "nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/memory_arguments.hpp" #include "onednn/iml_type_mapper.h" +#include "thread_pool_imp.hpp" namespace ov::intel_cpu { @@ -65,6 +66,7 @@ class DnnlConvolutionPrimitive { public: DnnlConvolutionPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities, impl_desc_type defaultImplType); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index aa7680275bebe8..1d502efc6ec9a3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -37,6 +37,7 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" +#include "thread_pool_imp.hpp" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" @@ -107,7 +108,10 @@ std::shared_ptr DnnlFCPrimitive::create(const MemoryArgs& memor attrs.modelType}; auto builder = [&context](const Key& dnnlKey) { - return std::make_shared(dnnlKey, context->getEngine(), context->getImplPriorities()); + return std::make_shared(dnnlKey, + context->getEngine(), + context->getThreadPool(), + context->getImplPriorities()); }; auto runtimeCache = context->getRuntimeCache(); @@ -479,8 +483,9 @@ static impl_desc_type implTypeFromPrimDesc(const dnnl::primitive_desc& primDesc) 
DnnlFCPrimitive::DnnlFCPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities) - : m_stream(dnnl::stream(engine)), + : m_stream(make_stream(engine, threadPool)), m_primDesc(createPrimitiveDesc( key.src->getDnnlDesc(), key.wei->getDnnlDesc(), diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp index ed381da5559a22..f8638e8b2efa53 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp @@ -19,6 +19,7 @@ #include "nodes/executors/memory_arguments.hpp" #include "onednn/iml_type_mapper.h" #include "openvino/core/type/element_type.hpp" +#include "thread_pool_imp.hpp" namespace ov::intel_cpu { @@ -37,7 +38,10 @@ class DnnlFCPrimitive { }; public: - DnnlFCPrimitive(const Key& key, const dnnl::engine& engine, const std::vector& implPriorities); + DnnlFCPrimitive(const Key& key, + const dnnl::engine& engine, + const std::shared_ptr& threadPool, + const std::vector& implPriorities); void execute(const dnnl_primitive_args& primArgs) const; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp index e4695ad7e2d96b..57cd6599bf3378 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp @@ -37,6 +37,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" #include "post_ops.hpp" +#include "thread_pool_imp.hpp" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" @@ -137,6 +138,7 @@ std::shared_ptr DnnlMatMulPrimitive::create(const MemoryArg auto builder = [&context, defaultImplType](const Key& dnnlKey) { return std::make_shared(dnnlKey, context->getEngine(), + context->getThreadPool(), context->getImplPriorities(), defaultImplType); }; @@ -572,9 +574,10 @@ static impl_desc_type implTypeFromPrimDesc(const dnnl::primitive_desc& primDesc) DnnlMatMulPrimitive::DnnlMatMulPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities, const impl_desc_type defaultImplType) - : m_stream(dnnl::stream(engine)), + : m_stream(make_stream(engine, threadPool)), m_primDesc(createPrimitiveDesc(key.src->getDnnlDesc(), key.wei->getDnnlDesc(), key.bias->getDnnlDesc(), diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp index 826e13e8ee3082..29a5412d925459 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp @@ -19,6 +19,7 @@ #include "nodes/executors/memory_arguments.hpp" #include "onednn/iml_type_mapper.h" #include "openvino/core/type/element_type.hpp" +#include "thread_pool_imp.hpp" namespace ov::intel_cpu { @@ -41,6 +42,7 @@ class DnnlMatMulPrimitive { public: DnnlMatMulPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities, impl_desc_type defaultImplType); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp 
b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp index 79bb26a4383fd0..7719c4270c1810 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp @@ -22,6 +22,7 @@ #include "nodes/reorder.h" #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" +#include "thread_pool_imp.hpp" #include "weights_cache.hpp" namespace ov::intel_cpu::utils { @@ -41,6 +42,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, context->getRuntimeCache(), context->getWeightsCache(), privateWeightCache, + context->getThreadPool(), needShiftSignedToUnsigned); } @@ -51,6 +53,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, const MultiCachePtr& rtCache, const WeightsSharing::Ptr& globalWeightCache, const std::shared_ptr>& privateWeightCache, + const std::shared_ptr& threadPool, bool needShiftSignedToUnsigned) { const auto format = dstWeightDesc->serializeFormat(); if (privateWeightCache) { @@ -71,7 +74,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, // prevent reorderData from doing conversion Memory srcMemory{eng, srcWeightDesc->cloneWithNewPrecision(dst_wdt), weightsMem->getData()}; MemoryPtr _ptr = std::make_shared(eng, dstWeightDesc); - node::Reorder::reorderData(srcMemory, *_ptr, rtCache); + node::Reorder::reorderData(srcMemory, *_ptr, rtCache, threadPool); // do shift auto count = _ptr->getSize() / _ptr->getDesc().getPrecision().size(); @@ -95,7 +98,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, Memory srcMemory{eng, srcWeightDesc, weightsMem->getData()}; MemoryPtr _ptr = std::make_shared(eng, dstWeightDesc); - node::Reorder::reorderData(srcMemory, *_ptr, rtCache); + node::Reorder::reorderData(srcMemory, *_ptr, rtCache, threadPool); return _ptr; }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.hpp index 1df2bdcbf2edfa..03edaa3e654a2f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.hpp @@ -31,5 +31,6 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, const MultiCachePtr& rtCache, const WeightsSharing::Ptr& globalWeightCache, const std::shared_ptr>& privateWeightCache, + const std::shared_ptr& threadPool, bool needShiftSignedToUnsigned = false); } // namespace ov::intel_cpu::utils diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp index cf22fe07f08442..c877f4d6b9ab13 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp @@ -61,7 +61,8 @@ class ExecutorContext { engine(graphContext->getEngine()), implPriorities(std::move(implPriorities)), privateWeighCache(std::move(privateWeighCache)), - numNumaNodes(graphContext->getNumNumaNodes()) { + numNumaNodes(graphContext->getNumNumaNodes()), + cpuParallel(graphContext->getCpuParallel()) { auto cpuStreamsExecutor = graphContext->getCPUStreamExecutor(); curNumaNodeId = std::max(0, cpuStreamsExecutor ? 
cpuStreamsExecutor->get_numa_node_id() : curNumaNodeId); } @@ -92,6 +93,10 @@ class ExecutorContext { return weightsCache; } + [[nodiscard]] std::shared_ptr getThreadPool() const { + return cpuParallel->get_thread_pool(); + } + private: // weak_ptr is required to avoid cycle dependencies with MultiCache // since ExecutorContext is stored in Executor itself @@ -104,6 +109,7 @@ class ExecutorContext { std::shared_ptr> privateWeighCache; int numNumaNodes; int curNumaNodeId = -1; + std::shared_ptr cpuParallel; }; class ExecutorFactoryLegacy { diff --git a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp index 562c9fe0c84734..6941fff88acf23 100644 --- a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp +++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -1700,6 +1699,7 @@ void FakeQuantize::createPrimitive() { } void FakeQuantize::executeReference() { + const auto& cpu_parallel = context->getCpuParallel(); auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1745,7 +1745,7 @@ void FakeQuantize::executeReference() { const auto* thresholds = internalBlobMemory[0]->getDataAs(); const auto* output_mask = internalBlobMemory[1]->getDataAs(); - parallel_nd(N, CB, D, H, W, [&](dim_t n, dim_t cb, dim_t d, dim_t h, dim_t w) { + cpu_parallel->parallel_for5d(N, CB, D, H, W, [&](dim_t n, dim_t cb, dim_t d, dim_t h, dim_t w) { uint8_t bin_val = 0x00; for (int c = static_cast(cb) * nbits, shift = 0; c < std::min(static_cast(C), (static_cast(cb) + 1) * nbits); @@ -1778,7 +1778,7 @@ void FakeQuantize::executeReference() { } else { auto* dst = dstMemory->getDataAs(); - parallel_nd(N, C, D, H, W, [&](dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { + cpu_parallel->parallel_for5d(N, C, D, H, W, [&](dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { size_t src_off = n * s_str[0]; if (srcDims.size() == 5) { src_off += c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4]; @@ -1826,6 +1826,7 @@ void FakeQuantize::executeReference() { } void FakeQuantize::executeBinarization(const std::unique_ptr& pKernel) const { #if defined(OPENVINO_ARCH_X86_64) + const auto& cpu_parallel = context->getCpuParallel(); auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1852,7 +1853,7 @@ void FakeQuantize::executeBinarization(const std::unique_ptrparallel_for3d(N, H, W, [&](dim_t n, dim_t h, dim_t w) { auto arg = jit_quantize_call_args(); arg.from = &src[(n * s_str[0] + h * s_str[2] + w * s_str[3]) * sizeof(float)]; @@ -1868,6 +1869,7 @@ void FakeQuantize::executeBinarization(const std::unique_ptr& pKernel) const { #if defined(OPENVINO_ARCH_X86_64) + const auto& cpu_parallel = context->getCpuParallel(); auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1924,7 +1926,7 @@ void FakeQuantize::executeQuantization(const std::unique_ptrparallel_for3d(N, CB, D, [&](dim_t n, dim_t cb, [[maybe_unused]] dim_t d) { auto arg = jit_quantize_call_args(); int c = static_cast(cb) * blk_size; @@ -1955,7 +1957,7 @@ void FakeQuantize::executeQuantization(const std::unique_ptr 2) { const int batch_size = 256; const int B = div_up(H * W, batch_size); - parallel_nd(N, CB, D, B, [&](dim_t n, dim_t cb, dim_t d, dim_t b) { + cpu_parallel->parallel_for4d(N, CB, D, B, [&](dim_t n, dim_t cb, dim_t d, dim_t b) { auto arg = jit_quantize_call_args(); const int c = static_cast(cb) * blk_size; @@ -1990,7 +1992,7 @@ void 
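From this point on the node changes follow one pattern: each node fetches the per-graph parallel facade once, `const auto& cpu_parallel = context->getCpuParallel();`, and every former free-function call (`parallel_for`, `parallel_nd`, `parallel_for2d`, ...) becomes a member call on that facade. A minimal sketch of the interface these call sites imply is shown here; the real class presumably also binds loops to the configured TBB partitioner, which this forwarding-only version omits (the class layout and method bodies are assumptions):

    // Hypothetical CpuParallel facade matching the call sites in this diff.
    // Real code would route these loops through the configured TBB arena/partitioner;
    // here they simply forward to the stock OpenVINO parallel helpers.
    #include <cstddef>

    #include "openvino/core/parallel.hpp"

    namespace ov::intel_cpu {

    class CpuParallel {
    public:
        template <typename F>
        void parallel_for(size_t n, const F& fn) const {
            ov::parallel_for(n, fn);
        }

        template <typename D0, typename D1, typename F>
        void parallel_for2d(D0 d0, D1 d1, const F& fn) const {
            ov::parallel_for2d(d0, d1, fn);
        }

        template <typename D0, typename D1, typename D2, typename F>
        void parallel_for3d(D0 d0, D1 d1, D2 d2, const F& fn) const {
            ov::parallel_for3d(d0, d1, d2, fn);
        }

        // parallel_for4d / parallel_for5d follow the same forwarding pattern.
    };

    }  // namespace ov::intel_cpu

The apparent design intent is to let the threading policy vary per compiled model rather than being fixed globally at build time.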
FakeQuantize::executeQuantization(const std::unique_ptrparallel_for4d(N, CB, D, H, [&](dim_t n, dim_t cb, dim_t d, dim_t h) { auto arg = jit_quantize_call_args(); int c = static_cast(cb) * blk_size; diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 4ac0de259b88fa..cee2e9be3a1db5 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -38,7 +38,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/threading/cpu_message.hpp" @@ -323,6 +322,7 @@ void FullyConnected::initTensorParallelSync() { void FullyConnected::execTensorParallelSync() { if (tp_cfg.enable_tensor_parallel) { + const auto& cpu_parallel = context->getCpuParallel(); // dst auto dst = getDstMemoryAtPort(0); auto* dst_ptr = static_cast(dst->getData()); @@ -364,7 +364,7 @@ void FullyConnected::execTensorParallelSync() { const auto copySize = splited_dim_vec[idx] * prec.size(); // bytes of half selected dim. const size_t unloop = 8; size_t step = count / unloop; - parallel_for(step, [&](size_t i) { + cpu_parallel->parallel_for(step, [&](size_t i) { cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop) * channel_size, new_ptr + (i * unloop) * copySize, copySize); diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index f99cc6a469efb9..808ec3198968a0 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -680,6 +680,7 @@ void Gather::initShortParams(threadExecParams& p, const uint64_t start) { template void Gather::execCompressed4Bit() { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcIndices = getSrcDataAtPortAs(GATHER_INDICES); const auto* srcData = getSrcDataAtPortAs(GATHER_DATA); auto* dstData = getDstDataAtPortAs(0); @@ -690,7 +691,7 @@ void Gather::execCompressed4Bit() { const auto* scale = getSrcDataAtPortAs(GATHER_SCALE); const size_t dstAfterBatchSize = betweenBatchAndAxisSize * specIdxAndAfterAxSize; - parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { + cpu_parallel->parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { int ii = srcIndices[b * specIndicesSize + j]; if (ii < 0) { if (reverseIndexing) { @@ -764,6 +765,7 @@ void Gather::execCompressed4Bit() { template void Gather::execCompressed8Bit() { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcIndices = getSrcDataAtPortAs(GATHER_INDICES); const auto* srcData = getSrcDataAtPortAs(GATHER_DATA); auto* dstData = getDstDataAtPortAs(0); @@ -775,7 +777,7 @@ void Gather::execCompressed8Bit() { const size_t dstAfterBatchSize = betweenBatchAndAxisSize * specIdxAndAfterAxSize; - parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { + cpu_parallel->parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { int ii = srcIndices[b * specIndicesSize + j]; if (ii < 0) { if (reverseIndexing) { @@ -925,12 +927,13 @@ void Gather::execCompressed() { } void Gather::execReference() { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcIndices = getSrcDataAtPortAs(GATHER_INDICES); const auto* srcData = getSrcDataAtPortAs(GATHER_DATA); auto* dstData = 
getDstDataAtPortAs(0); const size_t dstAfterBatchSize = betweenBatchAndAxisSize * specIdxAndAfterAxSizeBOut; - parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { + cpu_parallel->parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { int ii = srcIndices[b * specIndicesSize + j]; if (ii < 0) { if (reverseIndexing) { diff --git a/src/plugins/intel_cpu/src/nodes/grn.cpp b/src/plugins/intel_cpu/src/nodes/grn.cpp index 0c4d8a643fa7be..f75527bd91e57c 100644 --- a/src/plugins/intel_cpu/src/nodes/grn.cpp +++ b/src/plugins/intel_cpu/src/nodes/grn.cpp @@ -17,7 +17,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/grn.hpp" @@ -102,10 +101,11 @@ void GRN::executeDynamicImpl(const dnnl::stream& strm) { } void GRN::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const auto* src_data = getSrcDataAtPortAs(0); auto* dst_data = getDstDataAtPortAs(0); - parallel_for3d(N, H, W, [&](int b, int h, int w) { + cpu_parallel->parallel_for3d(N, H, W, [&](int b, int h, int w) { double variance = 0; for (int c = 0; c < C; c++) { variance += std::pow(src_data[b * C * H * W + c * H * W + h * W + w], 2); diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 0393c2390d2d3a..437a0c8f335c00 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -51,7 +51,6 @@ # include # include "cpu/x64/jit_generator.hpp" -# include "openvino/core/parallel.hpp" using namespace dnnl::impl::cpu::x64; using namespace Xbyak; @@ -452,6 +451,7 @@ void Input::cloneBlobIfRequired() { #if defined(OPENVINO_ARCH_X86_64) auto fn = jit_has_subnormals_function(); auto fn_bf16_check = jit_has_bf16_overflows_function(); + const auto& cpu_parallel = context->getCpuParallel(); if (fn && fn_bf16_check) { static const size_t batch_size = 2048; const size_t iterations_num = size / batch_size + 1; @@ -459,7 +459,7 @@ void Input::cloneBlobIfRequired() { std::atomic has_subnormals_local(false); std::atomic has_bf16_overflows_local(false); if (needFlushDenormalsToZero || do_bf16_saturation_check) { - parallel_for(iterations_num, [&](int n) { + cpu_parallel->parallel_for(iterations_num, [&](int n) { const auto* ptr = f32data + n * batch_size; jit_has_special_value_base::args_t args = { reinterpret_cast(ptr), diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp index b5db22802b5ac1..cb24258df6c509 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp @@ -2662,6 +2662,7 @@ std::vector Interpolate::getScales(const VectorDims& srcDimPad, const Vec } void Interpolate::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); auto dstMemPtr = getDstMemoryAtPort(0); auto srcMemPtr = getSrcMemoryAtPort(DATA_ID); @@ -2690,35 +2691,46 @@ void Interpolate::execute([[maybe_unused]] const dnnl::stream& strm) { if (interpAttrs.layout == InterpolateLayoutType::planar) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); auto* src_data_pad = static_cast(srcPadded.data()); - parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) { - const 
uint8_t* src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + - inShapeBlock[3] * d + inShapeBlock[4] * h) * - srcDataSize; - uint8_t* srcPad = - src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + - inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * - srcDataSize; - cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); - }); + cpu_parallel->parallel_for4d( + srcDim5d[0], + srcDim5d[1], + srcDim5d[2], + srcDim5d[3], + [&](int n, int c, int d, int h) { + const uint8_t* src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + + inShapeBlock[3] * d + inShapeBlock[4] * h) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + + inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * + srcDataSize; + cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); + }); src_data = src_data_pad; } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); auto* src_data_pad = static_cast(srcPadded.data()); - parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) { - const uint8_t* src = - src_data_origin + - (inShapeBlock[1] * n + - (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * - srcDataSize; - uint8_t* srcPad = - src_data_pad + (inShapePadBlock[1] * (n + padB0) + - (inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + - inShapePadBlock[5] * (w + padB4)) * - srcDimPad5d[1] + - padB1) * - srcDataSize; - cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); - }); + cpu_parallel->parallel_for4d( + srcDim5d[0], + srcDim5d[2], + srcDim5d[3], + srcDim5d[4], + [&](int n, int d, int h, int w) { + const uint8_t* src = + src_data_origin + + (inShapeBlock[1] * n + + (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + (inShapePadBlock[1] * (n + padB0) + + (inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + + inShapePadBlock[5] * (w + padB4)) * + srcDimPad5d[1] + + padB1) * + srcDataSize; + cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); + }); src_data = src_data_pad; } else if (interpAttrs.layout == InterpolateLayoutType::block) { size_t blkSize = mayiuse(cpu::x64::avx512_core) ? 
16 : 8; @@ -2728,28 +2740,28 @@ void Interpolate::execute([[maybe_unused]] const dnnl::stream& strm) { auto* src_data_pad = static_cast(srcPadded.data()); CPU_NODE_ASSERT((srcDim5d[0] == srcDimPad5d[0]) && (srcDim5d[1] == srcDimPad5d[1]), "does not support padding on batch and channel dimensions"); - parallel_for5d(srcDim5d[0], - CB, - srcDim5d[2], - srcDim5d[3], - srcDim5d[4], - [&](int n, int cb, int d, int h, int w) { - const uint8_t* src = - src_data_origin + - (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + - (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + - (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + - (h * srcDim5d[4] * blkSize) * srcDataSize + (w * blkSize) * srcDataSize; - uint8_t* srcPad = - src_data_pad + - (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * - srcDataSize + - (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + - ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + - ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize + - ((w + padB4) * blkSize) * srcDataSize; - cpu_memcpy(srcPad, src, blkSize * srcDataSize); - }); + cpu_parallel->parallel_for5d( + srcDim5d[0], + CB, + srcDim5d[2], + srcDim5d[3], + srcDim5d[4], + [&](int n, int cb, int d, int h, int w) { + const uint8_t* src = + src_data_origin + + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (h * srcDim5d[4] * blkSize) * srcDataSize + (w * blkSize) * srcDataSize; + uint8_t* srcPad = + src_data_pad + + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize + + ((w + padB4) * blkSize) * srcDataSize; + cpu_memcpy(srcPad, src, blkSize * srcDataSize); + }); src_data = src_data_pad; } } else { diff --git a/src/plugins/intel_cpu/src/nodes/inverse.cpp b/src/plugins/intel_cpu/src/nodes/inverse.cpp index 387e83fa6d188a..012dff080ba04a 100644 --- a/src/plugins/intel_cpu/src/nodes/inverse.cpp +++ b/src/plugins/intel_cpu/src/nodes/inverse.cpp @@ -21,7 +21,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/partial_shape.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" @@ -124,17 +123,18 @@ void Inverse::lu_decomposition(const float* data, size_t b) const { // Make L identity, U a copy of data and P a range(0, side) const auto batch_idx = b * m_side_squared; + const auto& cpu_parallel = context->getCpuParallel(); std::fill(L.begin(), L.end(), 0.0F); if (!m_adjoint) { cpu_parallel_memcpy(U.data(), &data[batch_idx], sizeof(float) * m_side_squared); } else { - parallel_for2d(m_side, m_side, [&](size_t i, size_t j) { + cpu_parallel->parallel_for2d(m_side, m_side, [&](size_t i, size_t j) { U[j * m_side + i] = data[batch_idx + i * m_side + j]; }); } - parallel_for(m_side, [&](size_t i) { + cpu_parallel->parallel_for(m_side, [&](size_t i) { L[i * m_side + i] = 1.0F; P[i] = i; }); @@ -156,7 +156,7 @@ void Inverse::lu_decomposition(const float* data, if (pivot_row != k) { // Swap rows in L, U and P std::swap(P[k], P[pivot_row]); - 
parallel_for(m_side, [&](size_t i) { + cpu_parallel->parallel_for(m_side, [&](size_t i) { std::swap(L[k_idx + i], L[pivot_idx + i]); std::swap(U[k_idx + i], U[pivot_idx + i]); }); @@ -165,12 +165,12 @@ void Inverse::lu_decomposition(const float* data, const auto remaining_columns = m_side - k; const auto remaining_rows = remaining_columns - 1; - parallel_for(remaining_rows, [&](size_t i) { + cpu_parallel->parallel_for(remaining_rows, [&](size_t i) { const auto i_idx = (i + k + 1) * m_side; L[i_idx + k] = U[i_idx + k] / U[k_idx + k]; }); - parallel_for(remaining_rows * remaining_columns, [&](size_t i) { + cpu_parallel->parallel_for(remaining_rows * remaining_columns, [&](size_t i) { const auto i_idx = (i / remaining_columns + k + 1) * m_side; const auto j_idx = i % remaining_columns + k; U[i_idx + j_idx] = U[i_idx + j_idx] - L[i_idx + k] * U[k_idx + j_idx]; @@ -183,7 +183,8 @@ void Inverse::lu_solve(float* output, std::vector& U, std::vector& P, size_t b) const { - parallel_for(m_side, [&](size_t column) { + const auto& cpu_parallel = context->getCpuParallel(); + cpu_parallel->parallel_for(m_side, [&](size_t column) { std::vector X(m_side, 0.0F); std::vector Y(m_side, 0.0F); diff --git a/src/plugins/intel_cpu/src/nodes/log_softmax.cpp b/src/plugins/intel_cpu/src/nodes/log_softmax.cpp index 779a9d247279b9..d459cd322a7cf0 100644 --- a/src/plugins/intel_cpu/src/nodes/log_softmax.cpp +++ b/src/plugins/intel_cpu/src/nodes/log_softmax.cpp @@ -19,7 +19,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/log_softmax.hpp" @@ -107,11 +106,12 @@ void LogSoftmax::executeDynamicImpl(const dnnl::stream& strm) { } void LogSoftmax::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcData = getSrcDataAtPortAs(0); auto* dstData = getDstDataAtPortAs(0); if (isLastDim) { - parallel_for(axisStep, [&](size_t i) { + cpu_parallel->parallel_for(axisStep, [&](size_t i) { const float* srcDataPtr = &srcData[i * reducedAxisSize]; float* dstDataPtr = &dstData[i * reducedAxisSize]; @@ -127,7 +127,7 @@ void LogSoftmax::execute([[maybe_unused]] const dnnl::stream& strm) { } }); } else { - parallel_for2d(axisStep, reducedAxisStride, [&](size_t k, size_t i) { + cpu_parallel->parallel_for2d(axisStep, reducedAxisStride, [&](size_t k, size_t i) { const float* srcDataPtr = &srcData[k * reducedAxisStride * reducedAxisSize + i]; float* dstDataPtr = &dstData[k * reducedAxisStride * reducedAxisSize + i]; diff --git a/src/plugins/intel_cpu/src/nodes/mathematics.cpp b/src/plugins/intel_cpu/src/nodes/mathematics.cpp index 39321ffeb44131..c4a468c3f404ea 100644 --- a/src/plugins/intel_cpu/src/nodes/mathematics.cpp +++ b/src/plugins/intel_cpu/src/nodes/mathematics.cpp @@ -22,7 +22,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/abs.hpp" @@ -104,90 +103,91 @@ void Math::execute([[maybe_unused]] const dnnl::stream& strm) { size_t dataSize = getChildEdgeAt(0)->getMemory().getShape().getElementsCount(); const auto* src_data = getSrcDataAtPortAs(0); auto* dst_data = getDstDataAtPortAs(0); + const auto& cpu_parallel = context->getCpuParallel(); switch (getAlgorithm()) { case 
Algorithm::MathAbs: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = (std::abs)(src_data[i]); }); break; case Algorithm::MathAcos: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = acosf(src_data[i]); }); break; case Algorithm::MathAcosh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = acoshf(src_data[i]); }); break; case Algorithm::MathAsin: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = asinf(src_data[i]); }); break; case Algorithm::MathAsinh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = asinhf(src_data[i]); }); break; case Algorithm::MathAtan: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = atanf(src_data[i]); }); break; case Algorithm::MathAtanh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = atanhf(src_data[i]); }); break; case Algorithm::MathCeiling: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = ceilf(src_data[i]); }); break; case Algorithm::MathCos: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = cosf(src_data[i]); }); break; case Algorithm::MathCosh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = coshf(src_data[i]); }); break; case Algorithm::MathFloor: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = floorf(src_data[i]); }); break; case Algorithm::MathHardSigmoid: alpha = (alpha == 0.0F) ? 0.2F : alpha; beta = (beta == 0.0F) ? 0.5F : beta; - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = (std::max)(0.F, (std::min)(1.F, alpha * src_data[i] + beta)); }); break; case Algorithm::MathNegative: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = -src_data[i]; }); break; case Algorithm::MathReciprocal: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = 1.0F / src_data[i]; }); break; case Algorithm::MathSelu: alpha = (alpha == 0.0F) ? 1.67326F : alpha; gamma = (gamma == 0.0F) ? 1.0507F : gamma; - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { float x = src_data[i]; dst_data[i] = (x > 0.0F) ? 
(gamma * x) : (gamma * alpha * (std::exp(x) - 1.0F)); }); break; case Algorithm::MathSign: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { if (src_data[i] > 0.0F) { dst_data[i] = 1.0F; } else if (src_data[i] < 0.0F) { @@ -200,28 +200,28 @@ void Math::execute([[maybe_unused]] const dnnl::stream& strm) { }); break; case Algorithm::MathSin: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = sinf(src_data[i]); }); break; case Algorithm::MathSinh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = sinhf(src_data[i]); }); break; case Algorithm::MathSoftPlus: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = logf(expf(src_data[i]) + 1); }); break; case Algorithm::MathSoftsign: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { float x = src_data[i]; dst_data[i] = x / (1.F + (std::abs)(x)); }); break; case Algorithm::MathTan: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = tanf(src_data[i]); }); break; diff --git a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp index 77d34e0e9434cd..cbfbdf3165dca3 100644 --- a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp @@ -216,9 +216,10 @@ size_t MatrixNms::nmsMatrix(const float* boxesData, std::vector iouMatrix((originalSize * (originalSize - 1)) >> 1); std::vector iouMax(originalSize); + const auto& cpu_parallel = context->getCpuParallel(); iouMax[0] = 0.; - ov::parallel_for(originalSize - 1, [&](size_t i) { + cpu_parallel->parallel_for(originalSize - 1, [&](size_t i) { float max_iou = 0.; size_t actual_index = i + 1; auto idx_a = candidateIndex[actual_index]; @@ -336,10 +337,11 @@ void MatrixNms::executeDynamicImpl(const dnnl::stream& strm) { } void MatrixNms::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const auto* boxes = getSrcDataAtPortAs(NMS_BOXES); const auto* scores = getSrcDataAtPortAs(NMS_SCORES); - ov::parallel_for2d(m_numBatches, m_numClasses, [&](size_t batchIdx, size_t classIdx) { + cpu_parallel->parallel_for2d(m_numBatches, m_numClasses, [&](size_t batchIdx, size_t classIdx) { if (classIdx == static_cast(m_backgroundClass)) { m_numPerBatchClass[batchIdx][classIdx] = 0; return; @@ -356,7 +358,7 @@ void MatrixNms::execute([[maybe_unused]] const dnnl::stream& strm) { m_numPerBatchClass[batchIdx][classIdx] = classNumDet; }); - ov::parallel_for(m_numBatches, [&](size_t batchIdx) { + cpu_parallel->parallel_for(m_numBatches, [&](size_t batchIdx) { size_t batchOffset = batchIdx * m_realNumClasses * m_realNumBoxes; BoxInfo* batchFilteredBox = m_filteredBoxes.data() + batchOffset; auto& numPerClass = m_numPerBatchClass[batchIdx]; diff --git a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp index 3bce1b209a5baa..8cac1838e9bf19 100644 --- a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp @@ -490,6 +490,7 @@ void MultiClassNms::nmsWithEta(const float* boxes, const VectorDims& scoresStrides, const VectorDims& roisnumStrides, const bool shared) { + const auto& cpu_parallel = context->getCpuParallel(); auto less = [](const boxInfo& l, const boxInfo& r) { 
return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx)); }; @@ -498,7 +499,7 @@ void MultiClassNms::nmsWithEta(const float* boxes, return iou <= adaptive_threshold ? 1.0F : 0.0F; }; - parallel_for2d(m_numBatches, m_numClasses, [&](int batch_idx, int class_idx) { + cpu_parallel->parallel_for2d(m_numBatches, m_numClasses, [&](int batch_idx, int class_idx) { if (!shared) { if (roisnum[batch_idx] <= 0) { m_numFiltBox[batch_idx][class_idx] = 0; @@ -608,7 +609,7 @@ void MultiClassNms::nmsWithoutEta(const float* boxes, const VectorDims& scoresStrides, const VectorDims& roisnumStrides, const bool shared) { - parallel_for2d(m_numBatches, m_numClasses, [&](int batch_idx, int class_idx) { + const auto& cpu_parallel = context->getCpuParallel(); + cpu_parallel->parallel_for2d(m_numBatches, m_numClasses, [&](int batch_idx, int class_idx) { /* // nms over a class over an image // boxes: num_priors, 4 diff --git a/src/plugins/intel_cpu/src/nodes/multinomial.cpp b/src/plugins/intel_cpu/src/nodes/multinomial.cpp index 348428f85b6be0..c6add9472428c1 100644 --- a/src/plugins/intel_cpu/src/nodes/multinomial.cpp +++ b/src/plugins/intel_cpu/src/nodes/multinomial.cpp @@ -22,7 +22,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/core/type/float16.hpp" #include "openvino/op/multinomial.hpp" @@ -184,6 +183,7 @@ template void Multinomial::execute_convert_type() { const auto* probs = getSrcDataAtPortAs(PROBS_PORT); auto* output = getDstDataAtPortAs(OUTPUT_PORT); + const auto& cpu_parallel = context->getCpuParallel(); std::vector m_cdf(m_input_elements_count); std::vector m_max_per_batch(m_batches_count); @@ -191,7 +191,7 @@ void Multinomial::execute_convert_type() { // exp & cumsum if (m_log_probs) { - parallel_for(m_batches_count, [&](size_t idx) { + cpu_parallel->parallel_for(m_batches_count, [&](size_t idx) { const auto start_idx = idx * m_probs_count; m_cdf[start_idx] = std::exp(probs[start_idx]); for (size_t prev = start_idx, curr = prev + 1; curr < (start_idx + m_probs_count); ++prev, ++curr) { @@ -199,7 +199,7 @@ void Multinomial::execute_convert_type() { } }); } else { - parallel_for(m_batches_count, [&](size_t idx_batch) { + cpu_parallel->parallel_for(m_batches_count, [&](size_t idx_batch) { const auto start_idx = idx_batch * m_probs_count; const auto* probs_start_idx = probs + start_idx; std::partial_sum(probs_start_idx, probs_start_idx + m_probs_count, m_cdf.begin() + start_idx); @@ -222,17 +222,17 @@ void Multinomial::execute_convert_type() { // max & divide const auto min_value_of_max = std::numeric_limits::min(); - parallel_for(m_batches_count, [&](size_t idx) { + cpu_parallel->parallel_for(m_batches_count, [&](size_t idx) { m_max_per_batch[idx] = std::max(m_cdf[(idx + 1) * m_probs_count - 1], min_value_of_max); }); - parallel_for(m_input_elements_count, [&](size_t idx) { + cpu_parallel->parallel_for(m_input_elements_count, [&](size_t idx) { size_t idx_max_elem = idx / m_probs_count; m_cdf[idx] = m_cdf[idx] / m_max_per_batch[idx_max_elem]; }); if (m_with_replacement) { - parallel_for(m_batches_samples_probs_count, [&](size_t idx) { + cpu_parallel->parallel_for(m_batches_samples_probs_count, [&](size_t idx) { size_t idx_batch = idx / m_samples_probs_count; size_t idx_num_samples_probs = idx % m_samples_probs_count; size_t idx_prob = idx_num_samples_probs % m_probs_count; @@ -246,7 +246,7 @@ void Multinomial::execute_convert_type() { } }); } else { // without replacement - adjust cdf after each sample drawn from batch, sequentially - parallel_for(m_batches_count, [&](size_t idx_batch) { + cpu_parallel->parallel_for(m_batches_count, [&](size_t idx_batch) { for (size_t idx_sample = 0LU; idx_sample < m_samples_count; ++idx_sample) { size_t idx_input = idx_batch * m_probs_count; size_t idx_output = idx_batch * m_samples_count + idx_sample; diff --git a/src/plugins/intel_cpu/src/nodes/ngram.cpp b/src/plugins/intel_cpu/src/nodes/ngram.cpp index 892bad8c8c7889..1ecf5daeffff7b 100644 --- a/src/plugins/intel_cpu/src/nodes/ngram.cpp +++ b/src/plugins/intel_cpu/src/nodes/ngram.cpp @@ -23,7 +23,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "shape_inference/custom/ngram.hpp" @@ -118,6 +117,7 @@ std::vector Ngram::computeBatchLenghts() { void Ngram::execute([[maybe_unused]] const dnnl::stream& strm) { const auto* srcData = getSrcDataAtPortAs(0); auto* dstData = getDstDataAtPortAs(0); + const auto& cpu_parallel = context->getCpuParallel(); std::vector batchLenghts; if (idcesPrecision == ov::element::i32) { @@ -133,7 +133,7 @@ 2.
Apply sliding window of windowSize with a step windowStride and form k new embedding vectors for the embedding */ memset(dstData, 0, numOutElems * sizeof(float)); - parallel_for(batchLenghts.size() - 1, [&](const size_t batchIdx) { + cpu_parallel->parallel_for(batchLenghts.size() - 1, [&](const size_t batchIdx) { size_t srcWindowBias = 0; size_t dstWindowBias = 0; diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp index 9e4f4c39519ea7..d86158e755cf95 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp @@ -364,6 +364,7 @@ void NonMaxSuppression::nmsWithSoftSigma(const float* boxes, const VectorDims& boxesStrides, const VectorDims& scoresStrides, std::vector& filtBoxes) { + const auto& cpu_parallel = context->getCpuParallel(); auto less = [](const boxInfo& l, const boxInfo& r) { return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx)); }; @@ -378,7 +379,7 @@ void NonMaxSuppression::nmsWithSoftSigma(const float* boxes, return std::exp(m_scale * iou * iou); }; - parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { + cpu_parallel->parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { std::vector selectedBoxes; const float* boxesPtr = boxes + batch_idx * boxesStrides[0]; const float* scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; @@ -516,8 +517,9 @@ void NonMaxSuppression::nmsWithoutSoftSigma(const float* boxes, const VectorDims& boxesStrides, const VectorDims& scoresStrides, std::vector& filtBoxes) { + const auto& cpu_parallel = context->getCpuParallel(); auto max_out_box = static_cast(m_output_boxes_per_class); - parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { + cpu_parallel->parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { const float* boxesPtr = boxes + batch_idx * boxesStrides[0]; const float* scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; @@ -862,7 +864,8 @@ void NonMaxSuppression::nmsRotated(const float* boxes, const VectorDims& scores_strides, std::vector& filtered_boxes) { CPU_NODE_ASSERT(!m_jit_kernel, "does not have implementation of the JIT kernel for Rotated boxes."); - parallel_for2d(m_batches_num, m_classes_num, [&](int64_t batch_idx, int64_t class_idx) { + const auto& cpu_parallel = context->getCpuParallel(); + cpu_parallel->parallel_for2d(m_batches_num, m_classes_num, [&](int64_t batch_idx, int64_t class_idx) { const float* boxes_ptr = boxes + batch_idx * boxes_strides[0]; const float* scores_ptr = scores + batch_idx * scores_strides[0] + class_idx * scores_strides[1]; diff --git a/src/plugins/intel_cpu/src/nodes/one_hot.cpp b/src/plugins/intel_cpu/src/nodes/one_hot.cpp index 8a84ee3b53363a..253693539bf6ae 100644 --- a/src/plugins/intel_cpu/src/nodes/one_hot.cpp +++ b/src/plugins/intel_cpu/src/nodes/one_hot.cpp @@ -19,7 +19,6 @@ #include "openvino/cc/selective_build.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/shape.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" @@ -127,6 +126,7 @@ void OneHot::initSupportedPrimitiveDescriptors() { template void OneHot::one_hot(size_t prefix_size, size_t suffix_size) { + const auto& cpu_parallel = context->getCpuParallel(); const auto* src_data = 
getSrcDataAtPortAs(0); auto* dst_data = getDstDataAtPortAs(0); @@ -139,7 +139,7 @@ void OneHot::one_hot(size_t prefix_size, size_t suffix_size) { // set on_value at needed locations auto on_val = on_value; - parallel_for(prefix_size, [&](std::size_t prefix_idx) { + cpu_parallel->parallel_for(prefix_size, [&](std::size_t prefix_idx) { const in_type* src_dataPtr = &src_data[prefix_idx * suffix_size]; out_type* dst_dataPtr = &dst_data[prefix_idx * depth * suffix_size]; for (std::size_t suffix_idx = 0; suffix_idx < suffix_size; ++suffix_idx, ++src_dataPtr, ++dst_dataPtr) { diff --git a/src/plugins/intel_cpu/src/nodes/priorbox.cpp b/src/plugins/intel_cpu/src/nodes/priorbox.cpp index 37d4edfee3969f..2e2bfd6d202e0c 100644 --- a/src/plugins/intel_cpu/src/nodes/priorbox.cpp +++ b/src/plugins/intel_cpu/src/nodes/priorbox.cpp @@ -21,7 +21,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/prior_box.hpp" @@ -152,6 +151,7 @@ void PriorBox::createPrimitive() { } void PriorBox::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const int* in_data = getSrcDataAtPortAs(0); const int H = in_data[0]; const int W = in_data[1]; @@ -316,18 +316,18 @@ void PriorBox::execute([[maybe_unused]] const dnnl::stream& strm) { } if (clip) { - parallel_for((H * W * number_of_priors * 4), [&](size_t i) { + cpu_parallel->parallel_for((H * W * number_of_priors * 4), [&](size_t i) { dst_data[i] = (std::min)((std::max)(dst_data[i], 0.0F), 1.0F); }); } uint64_t channel_size = OH * OW; if (variance.size() == 1) { - parallel_for(channel_size, [&](size_t i) { + cpu_parallel->parallel_for(channel_size, [&](size_t i) { dst_data[i + channel_size] = variance[0]; }); } else { - parallel_for(H * W * number_of_priors, [&](size_t i) { + cpu_parallel->parallel_for(H * W * number_of_priors, [&](size_t i) { for (size_t j = 0; j < 4; ++j) { dst_data[i * 4 + j + channel_size] = variance[j]; } diff --git a/src/plugins/intel_cpu/src/nodes/priorbox_clustered.cpp b/src/plugins/intel_cpu/src/nodes/priorbox_clustered.cpp index d4b8c47b178f87..5954de0e990978 100644 --- a/src/plugins/intel_cpu/src/nodes/priorbox_clustered.cpp +++ b/src/plugins/intel_cpu/src/nodes/priorbox_clustered.cpp @@ -20,7 +20,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/prior_box_clustered.hpp" @@ -107,6 +106,7 @@ void PriorBoxClustered::createPrimitive() { } void PriorBoxClustered::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const int* in_data = getSrcDataAtPortAs(0); const int layer_height = in_data[0]; const int layer_width = in_data[1]; @@ -126,7 +126,7 @@ void PriorBoxClustered::execute([[maybe_unused]] const dnnl::stream& strm) { const auto& out_shape = getChildEdgeAt(0)->getMemory().getShape().getStaticDims(); size_t var_size = variances.size(); - parallel_for2d(layer_height, layer_width, [&](int64_t h, int64_t w) { + cpu_parallel->parallel_for2d(layer_height, layer_width, [&](int64_t h, int64_t w) { float center_x = (static_cast(w) + offset) * step_w; float center_y = (static_cast(h) + offset) * step_h; diff --git 
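The PSROIPooling and Reduce hunks below repeat the same mechanical rewrite. As a usage illustration (a standalone toy, not an excerpt from the PR), the before/after of one migrated loop body looks like this:

    // Toy before/after for the migration pattern applied throughout these files.
    #include <cstddef>
    #include <vector>

    #include "openvino/core/parallel.hpp"

    static void fill_before(std::vector<float>& dst) {
        // Old style: free function bound to the global threading backend.
        ov::parallel_for(dst.size(), [&](size_t i) {
            dst[i] = static_cast<float>(i);
        });
    }

    template <typename CpuParallelPtr>
    static void fill_after(const CpuParallelPtr& cpu_parallel, std::vector<float>& dst) {
        // New style: the loop runs under the context-owned parallel facade,
        // which can apply the partitioner configured for this compiled model.
        cpu_parallel->parallel_for(dst.size(), [&](size_t i) {
            dst[i] = static_cast<float>(i);
        });
    }
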
a/src/plugins/intel_cpu/src/nodes/psroi_pooling.cpp b/src/plugins/intel_cpu/src/nodes/psroi_pooling.cpp index b5b1224f4d57c9..3e230e9adb237c 100644 --- a/src/plugins/intel_cpu/src/nodes/psroi_pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/psroi_pooling.cpp @@ -24,7 +24,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/deformable_psroi_pooling.hpp" @@ -277,6 +276,7 @@ void PSROIPooling::executeAverage(const inputType* srcData, const int roiBatchInd, const BlockedMemoryDesc& srcDesc, const BlockedMemoryDesc& dstDesc) { + const auto& cpu_parallel = context->getCpuParallel(); int inBlockSize = 0; int outBlockSize = 0; int outBlockCount = 0; @@ -338,7 +338,7 @@ void PSROIPooling::executeAverage(const inputType* srcData, } }; if (srcDesc.hasLayoutType(LayoutType::nspc)) { - parallel_for2d(nh, nw, [&](int h, int w) { + cpu_parallel->parallel_for2d(nh, nw, [&](int h, int w) { const int binOffsetOutput = n * nc * nh * nw; const int binOffsetInput = roiBatchInd * channels * height * width; for (int c = 0; c < nc; c++) { @@ -347,7 +347,7 @@ void PSROIPooling::executeAverage(const inputType* srcData, } }); } else if (srcDesc.hasLayoutType(LayoutType::ncsp)) { - parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { + cpu_parallel->parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { const int gc = (c * groupSize + h) * groupSize + w; const int outputBlockResidual = (dstDesc.hasLayoutType(LayoutType::ncsp) ? 0 : c % inBlockSize); const int outputBlockIdx = (c / outBlockSize) * outBlockSize; @@ -356,7 +356,7 @@ void PSROIPooling::executeAverage(const inputType* srcData, avgPsroi(c, h, w, 0, outputBlockResidual, binOffsetInput, binOffsetOutput); }); } else { // nChw16c, nChw8c - parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) { + cpu_parallel->parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) { int cStart = blkIdx * outBlockSize; int cEnd = (blkIdx == outBlockCount - 1 ? nc : cStart + outBlockSize); for (int c = cStart; c < cEnd; c++) { @@ -381,6 +381,7 @@ void PSROIPooling::executeBilinear(const inputType* srcData, const int roiBatchInd, const BlockedMemoryDesc& srcDesc, const BlockedMemoryDesc& dstDesc) { + const auto& cpu_parallel = context->getCpuParallel(); int inBlockSize = 0; int outBlockSize = 0; int outBlockCount = 0; @@ -488,17 +489,17 @@ void PSROIPooling::executeBilinear(const inputType* srcData, if (srcDesc.hasLayoutType(LayoutType::nspc)) { const int binOffsetOutput = currentRoi * nc * nh * nw; - parallel_for2d(nh, nw, [&](int h, int w) { + cpu_parallel->parallel_for2d(nh, nw, [&](int h, int w) { for (int c = 0; c < nc; c++) { bilinearPsroi(c, h, w, 0, binOffsetOutput + c); } }); } else if (srcDesc.hasLayoutType(LayoutType::ncsp)) { - parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { + cpu_parallel->parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { bilinearPsroi(c, h, w, 0, (currentRoi * outputChannelsPadding + c) * binCount); }); } else { // nChw16c, nChw8c - parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) { + cpu_parallel->parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) { int cStart = blkIdx * outBlockSize; int cEnd = (blkIdx == outBlockCount - 1 ? 
nc : cStart + outBlockSize); for (int c = cStart; c < cEnd; c++) { @@ -523,6 +524,7 @@ void PSROIPooling::executeBilinearDeformable(const inputType* srcData, const int channelsEachClass, const int currentRoi, const int roiBatchInd) { + const auto& cpu_parallel = context->getCpuParallel(); const float roiStartW = round(bottomRois[1]) * spatialScale - 0.5F; const float roiStartH = round(bottomRois[2]) * spatialScale - 0.5F; const float roiEndW = (round(bottomRois[3]) + 1.0F) * spatialScale - 0.5F; @@ -530,7 +532,7 @@ void PSROIPooling::executeBilinearDeformable(const inputType* srcData, // Force too small ROIs to be 1x1 const float roiWidth = std::max(roiEndW - roiStartW, 0.1F); // avoid 0 const float roiHeight = std::max(roiEndH - roiStartH, 0.1F); - parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { + cpu_parallel->parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { size_t dstIndex = ((currentRoi * nc + c) * nh + h) * nw + w; dstData[dstIndex] = 0; // Compute w and h at bottom @@ -587,6 +589,7 @@ void PSROIPooling::executeBilinearDeformable(const inputType* srcData, template void PSROIPooling::executeSpecified() { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcData = getSrcDataAtPortAs(0); const auto* bottomRoisBeginning = getSrcDataAtPortAs(1); auto* dstData = getDstDataAtPortAs(0); @@ -613,7 +616,7 @@ void PSROIPooling::executeSpecified() { channelsEachClass /= numClasses; } - parallel_for(realRois, [&](int currentRoi) { + cpu_parallel->parallel_for(realRois, [&](int currentRoi) { const float* bottomRois = bottomRoisBeginning + currentRoi * 5; auto roiBatchInd = static_cast(bottomRois[0]); if (getAlgorithm() == Algorithm::PSROIPoolingAverage) { diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index 422c5ed2619e0c..67ac320e267e03 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -2551,13 +2551,14 @@ void Reduce::reduce_type(const uint8_t* in_ptr, uint8_t* out_ptr) { } void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { + const auto& cpu_parallel = context->getCpuParallel(); output_info_reassign(&out_ptr); init_dst_data(out_ptr, dst_size); if (ReduceN && !ReduceC && !ReduceD && !ReduceH && !ReduceW) { size_t IA = IC * ID * IH * IW; reduce_stride = IA; - parallel_for(IA / blk_size, [&](size_t iba) { + cpu_parallel->parallel_for(IA / blk_size, [&](size_t iba) { size_t oba = iba; reduce_kernel_process(in_ptr + iba * blk_size * src_data_size, out_ptr + oba * blk_size * dst_data_size, @@ -2587,7 +2588,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { for (size_t i = 0; i < blk_size; i++) { index_buf[i] = i * work_amount * src_data_size; } - parallel_for(IK, [&](size_t ik) { + cpu_parallel->parallel_for(IK, [&](size_t ik) { size_t ok = ik; reduce_kernel_process(in_ptr_n + ik * blk_size * inner_size * src_data_size, out_ptr_n + ok * blk_size * output_inner_size * dst_data_size, @@ -2598,7 +2599,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { }); size_t tail_start = IK * blk_size; size_t IT = outer_size - tail_start; - parallel_for(IT, [&](size_t it) { + cpu_parallel->parallel_for(IT, [&](size_t it) { size_t ot = it; reduce_kernel_process(in_ptr_n + (tail_start + it) * inner_size * src_data_size, out_ptr_n + (tail_start + ot) * output_inner_size * dst_data_size, @@ -2607,14 +2608,14 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { }); } else { if (ReduceH) { - parallel_for2d(IC, 
ID, [&](size_t ic, size_t id) { + cpu_parallel->parallel_for2d(IC, ID, [&](size_t ic, size_t id) { size_t oc = ic; size_t od = id; GET_PTR_NCD_BASE_PTR_N_PLN; reduce_kernel_process(in_ptr_ncd, out_ptr_ncd, work_amount, 1); }); } else { - parallel_for3d(IC, ID, IH, [&](size_t ic, size_t id, size_t ih) { + cpu_parallel->parallel_for3d(IC, ID, IH, [&](size_t ic, size_t id, size_t ih) { size_t oc = ic; size_t od = id; GET_PTR_NCD_BASE_PTR_N_PLN; @@ -2642,7 +2643,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { init_dst_data(prc_ptr_n, prc_size); size_t IS = IH * IW; reduce_stride = IS; - parallel_for(IS / blk_size, [&](size_t ibs) { + cpu_parallel->parallel_for(IS / blk_size, [&](size_t ibs) { size_t pbs = ibs; reduce_kernel_process(in_ptr_n + ibs * blk_size * src_data_size, prc_ptr_n + pbs * blk_size * prc_data_size, @@ -2658,7 +2659,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { IC * ID); // step2: ReduceW reduce_kernel_reassign(); - parallel_for(PH, [&](size_t ph) { + cpu_parallel->parallel_for(PH, [&](size_t ph) { size_t oh = ph; reduce_kernel_process(prc_ptr_n + ph * PW * prc_data_size, out_ptr_n + oh * OW * dst_data_size, @@ -2673,7 +2674,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { for (size_t id = 0; id < ID; id++) { size_t od = ReduceD ? 0 : id; GET_PTR_NCD_PLN; - parallel_for(IH, [&](size_t ih) { + cpu_parallel->parallel_for(IH, [&](size_t ih) { size_t oh = ih; GET_PTR_NCDH_PLN; reduce_kernel_process(in_ptr_ncdh, out_ptr_ncdh, IW, 1); @@ -2696,11 +2697,11 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { } } } else if (!ReduceC && !ReduceD && ReduceH && !ReduceW) { - parallel_for2d(IC, ID, [&](size_t ic, size_t id) { + cpu_parallel->parallel_for2d(IC, ID, [&](size_t ic, size_t id) { size_t oc = ic; size_t od = id; GET_PTR_NCD_BASE_PTR_N_PLN; - parallel_for(IW / blk_size, [&](size_t ibw) { + cpu_parallel->parallel_for(IW / blk_size, [&](size_t ibw) { size_t obw = ibw; reduce_kernel_process(in_ptr_ncd + ibw * blk_size * src_data_size, out_ptr_ncd + obw * blk_size * dst_data_size, @@ -2723,7 +2724,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { // step1: !ReduceD && ReduceH && !ReduceW uint8_t* prc_ptr_n = vec_reduceDH_prc.data(); init_dst_data(prc_ptr_n, prc_size); - parallel_for2d(ID, IWB, [&](size_t id, size_t iwb) { + cpu_parallel->parallel_for2d(ID, IWB, [&](size_t id, size_t iwb) { size_t pd = id; size_t pwb = iwb; reduce_kernel_process(in_ptr_n + (id * IH * IW + iwb * blk_size) * src_data_size, @@ -2735,7 +2736,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { // step2: ReduceD reduce_stride = PW; reduce_kernel_reassign(); - parallel_for(IWB, [&](size_t iwb) { + cpu_parallel->parallel_for(IWB, [&](size_t iwb) { size_t pwb = iwb; size_t owb = iwb; reduce_kernel_process(prc_ptr_n + pwb * blk_size * prc_data_size, @@ -2749,7 +2750,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { // reduce tail reduce_stride = IW; size_t tail_start = IWB * blk_size; - parallel_for(IW - tail_start, [&](size_t i_tail) { + cpu_parallel->parallel_for(IW - tail_start, [&](size_t i_tail) { reduce_kernel_process(in_ptr_n + (tail_start + i_tail) * src_data_size, out_ptr_n + (tail_start + i_tail) * dst_data_size, 1, @@ -2757,10 +2758,10 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { ID * IH); }); } else { - parallel_for(IC, [&](size_t ic) { + cpu_parallel->parallel_for(IC, [&](size_t ic) { size_t oc = ic; GET_PTR_NC_PLN; - 
parallel_for(IWB, [&](size_t iwb) { + cpu_parallel->parallel_for(IWB, [&](size_t iwb) { size_t owb = iwb; reduce_kernel_process(in_ptr_nc + iwb * blk_size * src_data_size, out_ptr_nc + owb * blk_size * dst_data_size, @@ -2769,7 +2770,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { ID * IH); }); size_t tail_start = IWB * blk_size; - parallel_for(IW - tail_start, [&](size_t i_tail) { + cpu_parallel->parallel_for(IW - tail_start, [&](size_t i_tail) { reduce_kernel_process(in_ptr_nc + (tail_start + i_tail) * src_data_size, out_ptr_nc + (tail_start + i_tail) * dst_data_size, 1, @@ -2779,7 +2780,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { }); } } else if (ReduceC && ReduceD && ReduceH && !ReduceW) { - parallel_for(IW / blk_size, [&](size_t ibw) { + cpu_parallel->parallel_for(IW / blk_size, [&](size_t ibw) { size_t obw = ibw; reduce_kernel_process(in_ptr_n + ibw * blk_size * src_data_size, out_ptr_n + obw * blk_size * dst_data_size, @@ -2797,7 +2798,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { } else if (ReduceC && !ReduceD && !ReduceH && !ReduceW) { size_t IS = ID * IH * IW; reduce_stride = IS; - parallel_for(IS / blk_size, [&](size_t ibs) { + cpu_parallel->parallel_for(IS / blk_size, [&](size_t ibs) { size_t obs = ibs; reduce_kernel_process(in_ptr_n + ibs * blk_size * src_data_size, out_ptr_n + obs * blk_size * dst_data_size, @@ -2846,6 +2847,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { } void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { + const auto& cpu_parallel = context->getCpuParallel(); size_t ICB = div_up(IC, blk_size); size_t OCB = div_up(OC, blk_size); output_info_reassign(&out_ptr); @@ -2859,7 +2861,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { apply_division = getAlgorithm() == Algorithm::ReduceMean && attr.get()->post_ops_.len() == 0; apply_post_kernel = !apply_division; } - parallel_for2d(ICB, ID, [&](size_t icb, size_t id) { + cpu_parallel->parallel_for2d(ICB, ID, [&](size_t icb, size_t id) { size_t ocb = icb; size_t od = id; GET_PTR_NCD_BASE_PTR_N_BLK; @@ -2874,7 +2876,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { init_dst_data(vec_prc.data(), prc_size); uint8_t* out_ptr_n_cp = out_ptr_n; out_ptr_n = vec_prc.data(); - parallel_for(ICB, [&](size_t icb) { + cpu_parallel->parallel_for(ICB, [&](size_t icb) { size_t ocb = icb; GET_PTR_NC_BLK; reduce_kernel_process(in_ptr_nc, out_ptr_nc, ID * IH * IW * blk_size); @@ -2902,7 +2904,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { } } else if (ReduceC && !ReduceD && !ReduceH && !ReduceW) { reduce_stride = ID * IH * IW * blk_size; - parallel_for3d(ID, IH, IW, [&](size_t id, size_t ih, size_t iw) { + cpu_parallel->parallel_for3d(ID, IH, IW, [&](size_t id, size_t ih, size_t iw) { size_t icb = 0; size_t ocb = 0; GET_PTR_NC_BLK; @@ -2924,7 +2926,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { for (size_t ih = 0; ih < IH; ih++) { size_t oh = ReduceH ? 
 0 : ih;
                     GET_PTR_NCDH_BLK;
-                    parallel_for(IW, [&](size_t iw) {
+                    cpu_parallel->parallel_for(IW, [&](size_t iw) {
                         size_t ow = iw;
                         GET_PTR_NCDHW_BLK;
                         reduce_kernel_process(in_ptr_ncdhw, out_ptr_ncdhw, blk_size);
@@ -2942,6 +2944,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) {
 }
 
 void Reduce::reduce_BLK_concern_padding(const uint8_t* in_ptr, uint8_t* out_ptr) {
+    const auto& cpu_parallel = context->getCpuParallel();
     size_t ICB = div_up(IC, blk_size);
     size_t OCB = div_up(OC, blk_size);
     output_info_reassign(&out_ptr);
@@ -2968,7 +2971,7 @@ void Reduce::reduce_BLK_concern_padding(const uint8_t* in_ptr, uint8_t* out_ptr)
             size_t ocb = 0;
             ;
             size_t ic = icb * blk_size;
-            parallel_for(ID, [&](size_t id) {
+            cpu_parallel->parallel_for(ID, [&](size_t id) {
                 size_t od = id;
                 GET_PTR_NCD_BASE_PTR_N_BLK;
                 if (ic + blk_size <= IC) {
@@ -3024,7 +3027,7 @@ void Reduce::reduce_BLK_concern_padding(const uint8_t* in_ptr, uint8_t* out_ptr)
                 for (size_t ih = 0; ih < IH; ih++) {
                     size_t oh = ReduceH ? 0 : ih;
                     GET_PTR_NCDH_BLK;
-                    parallel_for(IW, [&](size_t iw) {
+                    cpu_parallel->parallel_for(IW, [&](size_t iw) {
                         size_t ow = iw;
                         GET_PTR_NCDHW_BLK;
                         reduce_kernel_process(in_ptr_ncdhw, out_ptr_ncdhw, blk_size);
@@ -3064,11 +3067,12 @@ inline void Reduce::reduce_kernel_process(const uint8_t* in_p,
 }
 
 inline void Reduce::reduce_kernel_post_process(uint8_t* out_ptr) {
+    const auto& cpu_parallel = context->getCpuParallel();
     const uint8_t* in_ptr = fuse_low_precision ? static_cast<uint8_t*>(intermediate_buf.data()) : nullptr;
     const size_t integerDivisor = empty_input ? 1 : IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
     const auto divisor = static_cast<float>(integerDivisor);
     if (layout == ReduceLayoutType::reduce_ncsp) {
-        parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
+        cpu_parallel->parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
             const uint8_t* in_p = in_ptr + (ob * OC + oc) * OD * OH * OW * intermediate_data_size;
             uint8_t* out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size;
             auto arg = jit_reduce_post_call_args();
@@ -3107,7 +3111,7 @@ inline void Reduce::reduce_kernel_post_process(uint8_t* out_ptr) {
         });
     } else {
         size_t OCB = div_up(OC, blk_size);
-        parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {
+        cpu_parallel->parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {
             const uint8_t* in_p = in_ptr + (ob * OCB + ocb) * OD * OH * OW * blk_size * intermediate_data_size;
             uint8_t* out_p = out_ptr + (ob * OCB + ocb) * OD * OH * OW * blk_size * dst_data_size;
             auto arg = jit_reduce_post_call_args();
@@ -3171,11 +3175,12 @@ void Reduce::nspc2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     const size_t DIM4 = OH;
     const size_t stride1 = DIM2 * DIM3 * DIM4;
     const size_t stride0 = stride1 * DIM1;
+    const auto& cpu_parallel = context->getCpuParallel();
     if (dst_data_size == 4) {
         const auto* src_data = reinterpret_cast<const float*>(proc_ptr);
         auto* dst_data = reinterpret_cast<float*>(out_ptr);
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * stride0 + j * DIM1;
             auto dst_off = b * stride0 + j;
             for (size_t dim1 = 0; dim1 < DIM1; dim1++) {
@@ -3187,7 +3192,7 @@ void Reduce::nspc2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     } else if (dst_data_size == 2) {
         const auto* src_data = reinterpret_cast<const uint16_t*>(proc_ptr);
         auto* dst_data = reinterpret_cast<uint16_t*>(out_ptr);
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * stride0 + j * DIM1;
             auto dst_off = b * stride0 + j;
             for (size_t dim1 = 0; dim1 < DIM1; dim1++) {
@@ -3199,7 +3204,7 @@ void Reduce::nspc2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     } else {
         const auto* src_data = proc_ptr;
         auto* dst_data = out_ptr;
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * stride0 + j * DIM1;
             auto dst_off = b * stride0 + j;
             for (size_t dim1 = 0; dim1 < DIM1; dim1++) {
@@ -3212,6 +3217,7 @@ void Reduce::nspc2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
 }
 
 void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
+    const auto& cpu_parallel = context->getCpuParallel();
     const size_t DIM0 = OB;
     const size_t DIM1 = OC;
     const size_t DIM2 = OD;
@@ -3224,7 +3230,7 @@ void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     if (dst_data_size == 4) {
         const auto* src_data = reinterpret_cast<const float*>(proc_ptr);
         auto* dst_data = reinterpret_cast<float*>(out_ptr);
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * src_stride0 + j * blk_size;
             auto dst_off = b * dst_stride0 + j;
             for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) {
@@ -3245,7 +3251,7 @@ void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     } else if (dst_data_size == 2) {
         const auto* src_data = reinterpret_cast<const uint16_t*>(proc_ptr);
         auto* dst_data = reinterpret_cast<uint16_t*>(out_ptr);
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * src_stride0 + j * blk_size;
             auto dst_off = b * dst_stride0 + j;
             for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) {
@@ -3266,7 +3272,7 @@ void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     } else {
         const auto* src_data = proc_ptr;
         auto* dst_data = out_ptr;
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * src_stride0 + j * blk_size;
             auto dst_off = b * dst_stride0 + j;
             for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) {
@@ -3288,6 +3294,7 @@ void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
 }
 
 inline void Reduce::init_dst_data(uint8_t* out_ptr, size_t dst_size) {
+    const auto& cpu_parallel = context->getCpuParallel();
     switch (algorithm) {
     case Algorithm::ReduceL1:
     case Algorithm::ReduceL2:
@@ -3303,32 +3310,32 @@ inline void Reduce::init_dst_data(uint8_t* out_ptr, size_t dst_size) {
     case Algorithm::ReduceProd:
         if (output_prec == ov::element::f32) {
             auto* out_p = reinterpret_cast<float*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<float>(1);
             });
         } else if (output_prec == ov::element::i32) {
             auto* out_p = reinterpret_cast<int32_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<int32_t>(1);
             });
         } else if (output_prec == ov::element::bf16) {
             auto* out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<bfloat16_t>(1);
             });
         } else if (output_prec == ov::element::f16) {
             auto* out_p = reinterpret_cast<ov::float16*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<ov::float16>(1);
             });
         } else if (output_prec == ov::element::u8) {
             auto* out_p = out_ptr;
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<uint8_t>(1);
             });
         } else if (output_prec == ov::element::i8) {
             auto* out_p = reinterpret_cast<int8_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<int8_t>(1);
             });
         }
@@ -3336,32 +3343,32 @@ inline void Reduce::init_dst_data(uint8_t* out_ptr, size_t dst_size) {
     case Algorithm::ReduceMax:
         if (output_prec == ov::element::f32) {
             auto* out_p = reinterpret_cast<float*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<float>::lowest();
             });
         } else if (output_prec == ov::element::i32) {
             auto* out_p = reinterpret_cast<int32_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<int32_t>::min();
             });
         } else if (output_prec == ov::element::bf16) {
             auto* out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<bfloat16_t>::lowest();
             });
         } else if (output_prec == ov::element::f16) {
             auto* out_p = reinterpret_cast<ov::float16*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<ov::float16>::lowest();
             });
         } else if (output_prec == ov::element::u8) {
             auto* out_p = out_ptr;
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<uint8_t>::min();
             });
         } else if (output_prec == ov::element::i8) {
             auto* out_p = reinterpret_cast<int8_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<int8_t>::min();
             });
         }
@@ -3369,32 +3376,32 @@ inline void Reduce::init_dst_data(uint8_t* out_ptr, size_t dst_size) {
     case Algorithm::ReduceMin:
         if (output_prec == ov::element::f32) {
             auto* out_p = reinterpret_cast<float*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<float>::max();
             });
         } else if (output_prec == ov::element::i32) {
             auto* out_p = reinterpret_cast<int32_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<int32_t>::max();
             });
         } else if (output_prec == ov::element::bf16) {
             auto* out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<bfloat16_t>::max();
            });
         } else if (output_prec == ov::element::f16) {
             auto* out_p = reinterpret_cast<ov::float16*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<ov::float16>::max();
             });
         } else if (output_prec == ov::element::u8) {
             auto* out_p = out_ptr;
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<uint8_t>::max();
             });
         } else if (output_prec == ov::element::i8) {
             auto* out_p = reinterpret_cast<int8_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<int8_t>::max();
             });
         }
@@ -3710,6 +3717,7 @@ void Reduce::reduce_ref_process(const float* in_ptr,
 }
 
 inline void Reduce::reduce_ref_map(float* out_ptr, size_t work_amount_dst, size_t reduced_dims_work_amount) {
+    const auto& cpu_parallel = context->getCpuParallel();
     switch (algorithm) {
     case Algorithm::ReduceAnd:
     case Algorithm::ReduceL1:
@@ -3721,18 +3729,18 @@ inline void Reduce::reduce_ref_map(float* out_ptr, size_t work_amount_dst, size_
     case Algorithm::ReduceSumSquare:
         break;
     case Algorithm::ReduceL2:
-        parallel_for(work_amount_dst, [&](size_t i) {
+        cpu_parallel->parallel_for(work_amount_dst, [&](size_t i) {
            out_ptr[i] = std::sqrt(out_ptr[i]);
         });
         break;
     case Algorithm::ReduceLogSum:
     case Algorithm::ReduceLogSumExp:
-        parallel_for(work_amount_dst, [&](size_t i) {
+        cpu_parallel->parallel_for(work_amount_dst, [&](size_t i) {
             out_ptr[i] = logf(out_ptr[i]);
         });
         break;
     case Algorithm::ReduceMean:
-        parallel_for(work_amount_dst, [&](size_t i) {
+        cpu_parallel->parallel_for(work_amount_dst, [&](size_t i) {
             out_ptr[i] /= reduced_dims_work_amount;
         });
         break;
diff --git a/src/plugins/intel_cpu/src/nodes/region_yolo.cpp b/src/plugins/intel_cpu/src/nodes/region_yolo.cpp
index b295b4e93615ae..22bd04af6c9492 100644
--- a/src/plugins/intel_cpu/src/nodes/region_yolo.cpp
+++ b/src/plugins/intel_cpu/src/nodes/region_yolo.cpp
@@ -24,7 +24,6 @@
 #include "onednn/iml_type_mapper.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/op/region_yolo.hpp"
@@ -392,10 +391,11 @@ inline float RegionYolo::logistic_scalar(float src) {
 }
 
 inline void RegionYolo::calculate_logistic(size_t start_index, int count, uint8_t* dst_data) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto dst_data_size = output_prec.size();
     if (logistic_kernel) {
         int blocks_num = div_up(count, block_size);
-        parallel_for(blocks_num, [&](int ib) {
+        cpu_parallel->parallel_for(blocks_num, [&](int ib) {
             int idx = ib * block_size;
             int work_amount = std::min(count - idx, block_size);
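
The change repeated through these node files is mechanical: the free parallel_for/parallel_for2d/parallel_for3d/parallel_for4d helpers from openvino/core/parallel.hpp are replaced with calls on a CpuParallel object obtained from the graph context, so the TBB partitioning policy becomes a per-context choice rather than a process-wide one. A minimal, self-contained sketch of the call shape; the CpuParallel below is a serial mock written only for illustration, while the real class lives in cpu_parallel.hpp and dispatches to TBB:

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <memory>

    // Serial mock of the CpuParallel interface the nodes now call; the real
    // class dispatches these loops to TBB with a configurable partitioner.
    struct CpuParallel {
        void parallel_for(size_t n, const std::function<void(size_t)>& body) const {
            for (size_t i = 0; i < n; i++) body(i);
        }
        void parallel_for2d(size_t d0, size_t d1, const std::function<void(size_t, size_t)>& body) const {
            for (size_t i0 = 0; i0 < d0; i0++)
                for (size_t i1 = 0; i1 < d1; i1++) body(i0, i1);
        }
    };

    int main() {
        // In the plugin this comes from the graph context: context->getCpuParallel().
        auto cpu_parallel = std::make_shared<CpuParallel>();
        cpu_parallel->parallel_for2d(2, 3, [&](size_t b, size_t j) {
            std::cout << "tile " << b << "," << j << "\n";
        });
    }
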
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp
index 7a275171d08447..99e4a705069529 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -38,8 +38,8 @@
 #include "onednn/iml_type_mapper.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type/element_type.hpp"
+#include "thread_pool_imp.hpp"
 #include "utils/debug_capabilities.h"
 #include "utils/general_utils.h"
 
@@ -353,6 +353,7 @@ bool Reorder::created() const {
 }
 
 void Reorder::optimizedNcsp2Nspc() {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto parentEdge = getParentEdgeAt(0);
     auto childEdge = getChildEdgeAt(0);
 
@@ -374,7 +375,7 @@ void Reorder::optimizedNcsp2Nspc() {
     const size_t stride1 = DIM2 * DIM3 * DIM4;
     const size_t stride2 = DIM2 * DIM3;
 
-    parallel_for3d(DIM0, DIM1, stride2, [&](size_t dim0, size_t dim1, size_t j) {
+    cpu_parallel->parallel_for3d(DIM0, DIM1, stride2, [&](size_t dim0, size_t dim1, size_t j) {
         size_t src_off = dim0 * src_batch_stride + j * DIM4 + dim1 * stride1;
         size_t dst_off = dim0 * dst_batch_stride + j * DIM4 * dst_channel_stride + dim1;
 
@@ -387,6 +388,7 @@ void Reorder::optimizedNcsp2Nspc() {
 }
 
 void Reorder::optimizedNspc2Ncsp() {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto parentEdge = getParentEdgeAt(0);
     auto childEdge = getChildEdgeAt(0);
 
@@ -405,7 +407,7 @@ void Reorder::optimizedNspc2Ncsp() {
     const size_t block_size = DIM2 * DIM3 * DIM4;
     const size_t src_batch_stride = block_size * DIM1;
     const size_t dst_batch_stride = dstStrides[0];
-    parallel_for2d(DIM0, block_size, [&](size_t b, size_t j) {
+    cpu_parallel->parallel_for2d(DIM0, block_size, [&](size_t b, size_t j) {
         auto src_off = b * src_batch_stride + j * DIM1;
         auto dst_off = b * dst_batch_stride + j;
         for (size_t dim1 = 0; dim1 < DIM1; ++dim1) {
@@ -468,7 +470,10 @@ std::string Reorder::getReorderArgs(const MemoryDesc& parentDesc, const MemoryDe
     return inArgs + "_" + outArgs;
 }
 
-void Reorder::reorderData(const IMemory& input, const IMemory& output, const MultiCachePtr& cache) {
+void Reorder::reorderData(const IMemory& input,
+                          const IMemory& output,
+                          const MultiCachePtr& cache,
+                          const std::shared_ptr<ThreadPool>& threadPool) {
     OPENVINO_ASSERT(input.getDesc().isDefined() && output.getDesc().isDefined(),
                     "Can't reorder data with dynamic shapes");
 
@@ -541,7 +546,7 @@ void Reorder::reorderData(const IMemory& input, const IMemory& output, const Mul
                               output.getDesc().serializeFormat());
         }
         if (reorder) {
-            dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
+            dnnl::stream loc_stream = make_stream(engine, threadPool);
             reorder.execute(loc_stream, {{DNNL_ARG_FROM, srcMemory}, {DNNL_ARG_TO, dstMemory}});
         } else {
             OPENVINO_THROW("Could not make onednn reorder.");
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h
index 6c444d41eb7fb8..477278a4b15a51 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.h
+++ b/src/plugins/intel_cpu/src/nodes/reorder.h
@@ -71,7 +71,10 @@ class Reorder : public Node {
 
     static std::string getReorderArgs(const MemoryDesc& parentDesc, const MemoryDesc& childDesc);
 
-    static void reorderData(const IMemory& input, const IMemory& output, const MultiCachePtr& cache = nullptr);
+    static void reorderData(const IMemory& input,
+                            const IMemory& output,
+                            const MultiCachePtr& cache = nullptr,
+                            const std::shared_ptr<ThreadPool>& threadPool = nullptr);
 
 private:
     dnnl::reorder::primitive prim;
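
Reorder::reorderData only changes how its oneDNN stream is constructed; the reorder primitive itself is untouched, and the nullptr default on the new parameter keeps existing call sites compiling (rnn.cpp below passes the context pool explicitly). For orientation, a standalone oneDNN reorder equivalent to the fallback path above, with comments marking the one line this patch swaps; buffer shapes and names here are illustrative:

    #include <oneapi/dnnl/dnnl.hpp>
    #include <vector>

    int main() {
        dnnl::engine eng(dnnl::engine::kind::cpu, 0);
        dnnl::memory::desc src_md({2, 3}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
        dnnl::memory::desc dst_md({2, 3}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ba);
        std::vector<float> src_buf(6, 1.0F);
        std::vector<float> dst_buf(6, 0.0F);
        dnnl::memory src(src_md, eng, src_buf.data());
        dnnl::memory dst(dst_md, eng, dst_buf.data());

        dnnl::reorder::primitive_desc pd(eng, src_md, eng, dst_md);
        dnnl::reorder reorder_prim(pd);

        // The patch's only change in reorderData(): instead of
        //   dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
        // the stream comes from make_stream(engine, threadPool), which attaches
        // a threadpool_interop pool under TBB_ADAPTIVE builds.
        dnnl::stream strm(eng);
        reorder_prim.execute(strm, src, dst);
        strm.wait();
        return 0;
    }
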
diff --git a/src/plugins/intel_cpu/src/nodes/rnn.cpp b/src/plugins/intel_cpu/src/nodes/rnn.cpp
index 74bead6998c3ae..410c98b549984a 100644
--- a/src/plugins/intel_cpu/src/nodes/rnn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/rnn.cpp
@@ -40,7 +40,6 @@
 #include "openvino/core/coordinate_diff.hpp"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/core/type/element_type_traits.hpp"
@@ -924,6 +923,7 @@ void RNN::fillSequenceDesc() {
 template <ov::element::Type_t ET>
 void RNN::fillWeights() {
     using DataType = typename element_type_traits<ET>::value_type;
+    const auto& cpu_parallel = context->getCpuParallel();
     CPU_NODE_ASSERT(getParentEdgeAt(wIdx)->getParent()->getType() == Type::Input, "expects Constant for port ", wIdx);
     auto w_const_blob = static_cast<Input*>(getParentEdgeAt(wIdx)->getParent().get())->getMemoryPtr();
     CPU_NODE_ASSERT(getParentEdgeAt(rIdx)->getParent()->getType() == Type::Input, "expects Constant for port ", rIdx);
@@ -956,7 +956,7 @@ void RNN::fillWeights() {
     const uint64_t step = SC * G;
     const uint64_t SC_DC = SC * DC;
 
-    parallel_for2d(G, SC, [&](size_t g, size_t out_i) {
+    cpu_parallel->parallel_for2d(G, SC, [&](size_t g, size_t out_i) {
         DataType* l_w_ptr = w_ptr + m_gate_map[g] * SC + out_i;
         DataType* s_w_ptr = ie_w_ptr + out_i * DC + g * SC_DC;
         for (size_t in_i = 0; in_i < DC; in_i++) {
@@ -992,7 +992,7 @@ void RNN::fillWeights() {
     const uint64_t step = SC * G;
     const uint64_t SC_2 = SC * SC;
 
-    parallel_for2d(G, SC, [&](size_t g, size_t out_i) {
+    cpu_parallel->parallel_for2d(G, SC, [&](size_t g, size_t out_i) {
         DataType* l_r_ptr = r_ptr + m_gate_map[g] * SC + out_i;
         DataType* s_r_ptr = ie_r_ptr + out_i * SC + g * SC_2;
         for (size_t in_i = 0; in_i < SC; in_i++) {
@@ -1024,6 +1024,7 @@ void RNN::fillWeights() {
 template <ov::element::Type_t ET>
 void RNN::fillBiases() {
     using DataType = typename element_type_traits<ET>::value_type;
+    const auto& cpu_parallel = context->getCpuParallel();
     CPU_NODE_ASSERT(getParentEdgeAt(bIdx)->getParent()->getType() == Type::Input, "expects Constant for port ", bIdx);
     auto b_const_blob = static_cast<Input*>(getParentEdgeAt(bIdx)->getParent().get())->getMemoryPtr();
 
@@ -1062,7 +1063,7 @@ void RNN::fillBiases() {
     }
 
     const uint64_t step = SC * sizeof(DataType);
-    parallel_for(Gb, [&](size_t g) {
+    cpu_parallel->parallel_for(Gb, [&](size_t g) {
         DataType* l_b_ptr = b_ptr + m_gate_map[g] * SC;
         const DataType* l_ie_b_ptr = ie_b_ptr + g * SC;
         cpu_memcpy(l_b_ptr, l_ie_b_ptr, step);
@@ -1087,7 +1088,10 @@ void RNN::prepareMemory(const DnnlMemoryDescPtr& new_desc, size_t idx) {
     auto create = [&]() {
         Memory memory{getEngine(), m_initial_weights[idx]->getDescPtr(), m_initial_weights[idx]->getData()};
         MemoryPtr res_ptr = std::make_shared<Memory>(getEngine(), new_desc);
-        node::Reorder::reorderData(memory, *res_ptr, context->getParamsCache());
+        node::Reorder::reorderData(memory,
+                                   *res_ptr,
+                                   context->getParamsCache(),
+                                   context->getCpuParallel()->get_thread_pool());
         return res_ptr;
     };
diff --git a/src/plugins/intel_cpu/src/nodes/roi_align.cpp b/src/plugins/intel_cpu/src/nodes/roi_align.cpp
index 911294f8c03df6..f9c063dd8be291 100644
--- a/src/plugins/intel_cpu/src/nodes/roi_align.cpp
+++ b/src/plugins/intel_cpu/src/nodes/roi_align.cpp
@@ -922,6 +922,7 @@ void ROIAlign::execute([[maybe_unused]] const dnnl::stream& strm) {
 
 template <typename inputType, typename outputType>
 void ROIAlign::executeSpecified() {
+    const auto& cpu_parallel = context->getCpuParallel();
     const auto& srcMemory0 = getParentEdgeAt(0)->getMemory();
     const auto& srcMemory1 = getParentEdgeAt(1)->getMemory();
     const auto& dstMemory = getChildEdgeAt(0)->getMemory();
@@ -993,7 +994,7 @@ void ROIAlign::executeSpecified() {
         }
     }
 
-    parallel_for(realRois, [&](size_t n) {
+    cpu_parallel->parallel_for(realRois, [&](size_t n) {
         int roiOff = n * 4;
         const float* srcRoiPtr = &srcRoi[roiOff];
         int roiBatchInd = srcRoiIdx[n];
@@ -1165,7 +1166,7 @@ void ROIAlign::executeSpecified() {
             });
         } else {
             // one lane for one sample generation, then pooling all samples.
-            parallel_for4d(realRois, C, pooledH, pooledW, [&](int n, int cIdx, int yBinInd, int xBinInd) {
+            cpu_parallel->parallel_for4d(realRois, C, pooledH, pooledW, [&](int n, int cIdx, int yBinInd, int xBinInd) {
                 size_t batchSrcOffset = srcRoiIdx[n] * batchInputStride;
                 size_t channelSrcOffset = batchSrcOffset + cIdx * H * W;
                 size_t binOffset = yBinInd * pooledW + xBinInd;
@@ -1187,7 +1188,7 @@ void ROIAlign::executeSpecified() {
             }
         } else {  // ref with planar
-            parallel_for4d(realRois, C, pooledH, pooledW, [&](int n, int cIdx, int yBinInd, int xBinInd) {
+            cpu_parallel->parallel_for4d(realRois, C, pooledH, pooledW, [&](int n, int cIdx, int yBinInd, int xBinInd) {
                 int numSamplesROI = numSamples[n];
                 size_t batchSrcOffset = srcRoiIdx[n] * batchInputStride;
                 size_t channelSrcOffset = batchSrcOffset + cIdx * H * W;
diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
index c0dcf1b74aca65..ba0c226affb979 100644
--- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
@@ -1918,6 +1918,7 @@ void ScaledDotProductAttention::updateBeamTable(const MemoryPtr& mem_beam_idx, s
 
 // Update pastkv using cur_k, cur_v, simply append cur_k, cur_v to the end of pastkv in the state.
 void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const MemoryPtr& mem_cur_v) {
+    const auto& cpu_parallel = context->getCpuParallel();
     // L, B, H, S -> [2, 0, 1, 3] -> B, H, L, S
     std::vector<size_t> order = {0, 1, 2, 3};
     if (!m_config.config.permute_axes.empty()) {
@@ -2019,14 +2020,14 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M
         [&](const SDPAQuantParam& quant_param, PlainTensor& new_scale_zp, PlainTensor& old_scale_zp) {
             if (quant_param.isByChannel) {
                 size_t group_nums = div_up(L0, quant_param.groupSize) * 2;
-                parallel_for(group_nums, [&](size_t m) {
+                cpu_parallel->parallel_for(group_nums, [&](size_t m) {
                     memcpy(new_scale_zp.ptr<float>(m),
                            old_scale_zp.ptr<float>(m),
                            sizeof(float) * old_scale_zp.m_dims[1] * old_scale_zp.m_dims[2] *
                                old_scale_zp.m_dims[3]);
                 });
             } else {
-                parallel_for(L0, [&](size_t m) {
+                cpu_parallel->parallel_for(L0, [&](size_t m) {
                     memcpy(new_scale_zp.ptr<float>(m),
                            old_scale_zp.ptr<float>(m),
                            sizeof(float) * old_scale_zp.m_dims[1] * old_scale_zp.m_dims[2] *
diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp
index 85195237af4317..811b076a283239 100644
--- a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp
+++ b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp
@@ -985,6 +985,7 @@ void ScatterUpdate::execute([[maybe_unused]] const dnnl::stream& strm) {
 // and indices tensor of shape [i_0, i_1, ..., i_k].
 // Updates tensor shape should be [d_0, d_1, ... d_(axis - 1), i_0, i_1, ..., i_k, d_(axis + 1), ..., d_n].
 void ScatterUpdate::scatterUpdate(uint8_t* indices, uint8_t* update, int axis, uint8_t* dstData) {
+    const auto& cpu_parallel = context->getCpuParallel();
     const auto& srcDataDim = getParentEdgeAt(DATA_ID)->getMemory().getStaticDims();
     const auto& indicesDim = getParentEdgeAt(INDICES_ID)->getMemory().getStaticDims();
     const auto& updateDim = getParentEdgeAt(UPDATE_ID)->getMemory().getStaticDims();
@@ -1006,7 +1007,7 @@ void ScatterUpdate::scatterUpdate(uint8_t* indices, uint8_t* update, int axis, u
     size_t blockToUpdate = srcBlockND[axis + 1];
     size_t blockToUpdateSize = blockToUpdate * dataSize;
 
-    parallel_for2d(batchToUpdate, idxLength, [&](size_t b, size_t idx) {
+    cpu_parallel->parallel_for2d(batchToUpdate, idxLength, [&](size_t b, size_t idx) {
         int64_t idxValue = getIndicesValue(indices, idx);
         uint8_t* dstEntry = dstData + (b * srcBlockND[axis] + idxValue * blockToUpdate) * dataSize;
         uint8_t* updateEntry = update + (b * updateBlockND[axis] + idx * blockToUpdate) * dataSize;
@@ -1038,6 +1039,7 @@ void ScatterUpdate::scatterNDUpdate(const MemoryPtr& mem_data,
                                     const MemoryPtr& mem_indices,
                                     const MemoryPtr& mem_updates,
                                     [[maybe_unused]] const scatter_reductions::ReduceNone& kernel) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto* indices = mem_indices->getDataAs<uint8_t>();
     auto* update = mem_updates->getDataAs<uint8_t>();
     auto* dstData = mem_data->getDataAs<uint8_t>();
@@ -1055,7 +1057,7 @@ void ScatterUpdate::scatterNDUpdate(const MemoryPtr& mem_data,
     }
 
     size_t sizeToUpdate = srcBlockND[k] * dataSize;
-    parallel_for(idxTupleNum, [&](size_t tupleIdx) {
+    cpu_parallel->parallel_for(idxTupleNum, [&](size_t tupleIdx) {
         size_t indicesOffset = tupleIdx * k;
         size_t dstOffset = 0;
         for (size_t i = 0; i < k; i++) {
diff --git a/src/plugins/intel_cpu/src/nodes/split.cpp b/src/plugins/intel_cpu/src/nodes/split.cpp
index e8cbc83a930eb4..23ebaaf3a52b1d 100644
--- a/src/plugins/intel_cpu/src/nodes/split.cpp
+++ b/src/plugins/intel_cpu/src/nodes/split.cpp
@@ -459,6 +459,7 @@ void Split::selectOptimalPrimitiveDescriptor() {
 }
 
 void Split::optimizedNspc2Ncsp(size_t MB) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto parentEdge = getParentEdgeAt(0);
     const int rank = parentEdge->getMemory().getShape().getRank();
     const auto parentDims = parentEdge->getMemory().getStaticDims();
@@ -490,7 +491,7 @@ void Split::optimizedNspc2Ncsp(size_t MB) {
     const size_t OC = dims[1];
     const size_t strideOB = OC * strideOC;
 
-    parallel_for2d(MB, DHW, [&](size_t b, size_t j) {
+    cpu_parallel->parallel_for2d(MB, DHW, [&](size_t b, size_t j) {
         const auto* localSrcPtr = srcPtr + b * strideIB + j * strideIW;
         auto* localDstPtr = dstData + b * strideOB + j * dataSize;
         for (size_t c = 0; c < OC; c++) {
diff --git a/src/plugins/intel_cpu/src/nodes/stft.cpp b/src/plugins/intel_cpu/src/nodes/stft.cpp
index ec6545fbf60319..7ecd710cc1fff1 100644
--- a/src/plugins/intel_cpu/src/nodes/stft.cpp
+++ b/src/plugins/intel_cpu/src/nodes/stft.cpp
@@ -116,6 +116,7 @@ void transpose_out4d(const uint8_t* in,
 }  // namespace
 
 void STFT::execute([[maybe_unused]] const dnnl::stream& strm) {
+    const auto& cpu_parallel = context->getCpuParallel();
     const auto* signal = getSrcDataAtPortAs<const float>(DATA_IDX);
     const auto* window = getSrcDataAtPortAs<const float>(WINDOW_IDX);
     auto* rdft_result = getDstDataAtPortAs<float>(0);
@@ -147,7 +148,7 @@ void STFT::execute([[maybe_unused]] const dnnl::stream& strm) {
         dst = dst_mem->getDataAs<float>();
     }
 
-    parallel_for2d(batch_size, num_frames, [&](size_t batch, size_t frame_idx) {
+    cpu_parallel->parallel_for2d(batch_size, num_frames, [&](size_t batch, size_t frame_idx) {
         size_t batch_in_start = batch * signal_length;
         size_t batch_frames_out = batch * num_frames;
diff --git a/src/plugins/intel_cpu/src/nodes/topk.cpp b/src/plugins/intel_cpu/src/nodes/topk.cpp
index 95f4bf3a4976a4..c2ad4d958998e3 100644
--- a/src/plugins/intel_cpu/src/nodes/topk.cpp
+++ b/src/plugins/intel_cpu/src/nodes/topk.cpp
@@ -30,7 +30,6 @@
 #include "onednn/iml_type_mapper.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/op/constant.hpp"
@@ -2248,6 +2247,7 @@ void TopK::execute([[maybe_unused]] const dnnl::stream& strm) {
 }
 
 void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_idx_ptr) {
+    const auto& cpu_parallel = context->getCpuParallel();
     uint8_t* process_ptr = vec_process_ptr.data();
     uint8_t* process_idx_ptr = vec_process_idx_ptr.data();
 
@@ -2256,7 +2256,7 @@ void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_id
         size_t IA = div_up(src_dims[1], blk_size);
         size_t OA = div_up(dst_dims[1], blk_size);
         if (algorithm == TopKAlgorithm::topk_bubble_sort) {
-            parallel_for2d(O, I, [&](size_t o, size_t i) {
+            cpu_parallel->parallel_for2d(O, I, [&](size_t o, size_t i) {
                 const uint8_t* in_ptr_a = in_ptr + (o * IA * I + i) * blk_size * data_size;
                 uint8_t* out_ptr_a = out_ptr + (o * OA * I + i) * blk_size * data_size;
                 uint8_t* out_idx_ptr_a = out_idx_ptr + (o * OA * I + i) * blk_size * sizeof(int32_t);
@@ -2264,7 +2264,7 @@ void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_id
                 topk_kernel_process(in_ptr_a, out_ptr_a, out_idx_ptr_a, nullptr, nullptr, work_amount);
             });
         } else if (algorithm == TopKAlgorithm::topk_bitonic_sort) {
-            parallel_for(O, [&](size_t o) {
+            cpu_parallel->parallel_for(O, [&](size_t o) {
                 const uint8_t* in_ptr_a = in_ptr + o * IA * I * blk_size * data_size;
                 uint8_t* process_ptr_a = process_ptr + o * IA * I * blk_size * data_size;
                 uint8_t* process_idx_ptr_a = process_idx_ptr + o * IA * I * blk_size * sizeof(int32_t);
@@ -2275,7 +2275,7 @@ void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_id
             });
         }
     } else {  // [planar layout] [blocked layout with topk on non-C]
-        parallel_for2d(O, I / blk_size, [&](size_t o, size_t k) {
+        cpu_parallel->parallel_for2d(O, I / blk_size, [&](size_t o, size_t k) {
             const uint8_t* in_ptr_a = in_ptr + (o * A * I + k * blk_size) * data_size;
             uint8_t* process_ptr_a = process_ptr + (o * A * I + k * blk_size) * data_size;
             uint8_t* process_idx_ptr_a = process_idx_ptr + (o * A * I + k * blk_size) * sizeof(int32_t);
@@ -2288,7 +2288,7 @@ void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_id
         size_t tail_start = I / blk_size * blk_size;
         size_t work_amount = I - tail_start;
         if (work_amount) {
-            parallel_for(O, [&](size_t o) {
+            cpu_parallel->parallel_for(O, [&](size_t o) {
                 const uint8_t* in_ptr_a = in_ptr + (o * A * I + tail_start) * data_size;
                 uint8_t* process_ptr_a = process_ptr + (o * A * I + tail_start) * data_size;
                 uint8_t* process_idx_ptr_a = process_idx_ptr + (o * A * I + tail_start) * sizeof(int32_t);
@@ -2486,9 +2486,10 @@ void TopK::topk_ref_process(const float* src_data,
                             int32_t* dst_idx,
                             const VectorDims& in_dims,
                             std::function<float(float, float)> compare) const {
+    const auto& cpu_parallel = context->getCpuParallel();
     int after_num = count(in_dims, axis + 1, in_dims.size());
 
-    parallel_for2d(before_num, after_num, [&](int i0, int i1) {
+    cpu_parallel->parallel_for2d(before_num, after_num, [&](int i0, int i1) {
        std::vector<float> max_values(top_k + 1);
        std::vector<int> max_indexes(top_k + 1);
        int s_index = i0 * dim * after_num + i1;
diff --git a/src/plugins/intel_cpu/src/nodes/unique.cpp b/src/plugins/intel_cpu/src/nodes/unique.cpp
index dfb18e7ae1f181..6eaef4cbd4e3f8 100644
--- a/src/plugins/intel_cpu/src/nodes/unique.cpp
+++ b/src/plugins/intel_cpu/src/nodes/unique.cpp
@@ -26,7 +26,6 @@
 #include "openvino/cc/selective_build.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "selective_build.h"
@@ -297,6 +296,7 @@ void Unique::flattenTensorExec() {
 
 template <typename T>
 void Unique::slicedTensorExec() {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto inDataMemPtr = getSrcMemoryAtPort(IN_DATA);
     const auto* srcDataPtr = inDataMemPtr->getDataAs<const T>();
     int* firstTmpPtr = nullptr;
@@ -400,7 +400,7 @@ void Unique::slicedTensorExec() {
     const auto dstOuterStep = innerLen * uniqueLen;
     // Filling of the first output if needed.
     if (sorted || definedOutputs[UNIQUE_DATA]) {
-        parallel_for(uniqueLen, [&](size_t u) {
+        cpu_parallel->parallel_for(uniqueLen, [&](size_t u) {
            const auto* first1 = srcDataPtr + uniqIdx[u] * innerLen;
            auto first2 = dstDataPtr + u * innerLen;
            for (int64_t p = 0LU; p < outerLen; p++) {
@@ -449,7 +449,7 @@ void Unique::slicedTensorExec() {
         });
 
         // Permutation
-        parallel_for2d(outerLen, uniqueLen, [&](int64_t ot, size_t u) {
+        cpu_parallel->parallel_for2d(outerLen, uniqueLen, [&](int64_t ot, size_t u) {
             auto src = dst1 + ot * dstOuterStep + colToSort[u].idx * innerLen;
             auto dst = dst2 + ot * dstOuterStep + u * innerLen;
 
@@ -457,7 +457,7 @@ void Unique::slicedTensorExec() {
         });
 
         if (defined3outputs) {
-            parallel_for(uniqueLen, [&](size_t u) {
+            cpu_parallel->parallel_for(uniqueLen, [&](size_t u) {
                 if (definedOutputs[FIRST_UNIQUE_IDX]) {
                     first1[u] = first2[colToSort[u].idx];
                 }
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
index e867f84fef18b3..625c9fc560f894 100644
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@@ -272,27 +272,39 @@ void Plugin::get_performance_streams(Config& config,
                                      const std::shared_ptr<ov::Model>& model,
                                      bool imported) {
-    const auto model_prefer_name = std::string("MODEL_PREFER_THREADS");
+    std::vector<std::string> model_prefer_name = {std::string("MODEL_PREFER_THREADS_LATENCY"),
+                                                  std::string("MODEL_PREFER_THREADS_THROUGHPUT"),
+                                                  std::string("TBB_PARTITIONER")};
 
     if (imported && model->has_rt_info("intel_cpu_hints_config")) {
-        // load model_prefer_threads from cache
-        int cache_model_prefer = 0;
+        // load model_prefer_threads and tbbPartitioner from cache
         const auto& hints_config = model->get_rt_info<ov::AnyMap>("intel_cpu_hints_config");
-        const auto it_model_prefer = hints_config.find(model_prefer_name);
-        if (it_model_prefer != hints_config.end()) {
-            try {
-                cache_model_prefer = it_model_prefer->second.as<int>();
-            } catch (const ov::Exception&) {
-                OPENVINO_THROW("Cache file doesn't have valid value for " + model_prefer_name);
+        for (auto& one_name : model_prefer_name) {
+            auto it_model_prefer = hints_config.find(one_name);
+            if (it_model_prefer != hints_config.end()) {
+                try {
+                    if (one_name == std::string("TBB_PARTITIONER")) {
+                        conf.tbbPartitioner = it_model_prefer->second.as<ov::intel_cpu::TbbPartitioner>();
+                    } else if (one_name == std::string("MODEL_PREFER_THREADS_LATENCY")) {
+                        conf.modelPreferThreadsLatency = it_model_prefer->second.as<int>();
+                    } else {
+                        conf.modelPreferThreadsThroughput = it_model_prefer->second.as<int>();
+                    }
+                } catch (const ov::Exception&) {
+                    OPENVINO_THROW("Cache file doesn't have valid value for " + one_name);
+                }
+            }
-
-            conf.modelPreferThreads = cache_model_prefer;
         }
+        conf.modelPreferThreads = 0;
     }
 
     get_performance_streams(conf, model);
     // save model_prefer_threads to model rt_info when loading network
     if (!imported) {
         ov::AnyMap hints_props;
-        hints_props.insert({model_prefer_name, std::to_string(conf.modelPreferThreads)});
+        hints_props.insert({model_prefer_name[0], std::to_string(conf.modelPreferThreadsLatency)});
+        hints_props.insert({model_prefer_name[1], std::to_string(conf.modelPreferThreadsThroughput)});
+        std::stringstream tbb_partitioner;
+        tbb_partitioner << conf.tbbPartitioner;
+        hints_props.insert({model_prefer_name[2], tbb_partitioner.str()});
         model->set_rt_info(hints_props, "intel_cpu_hints_config");
     }
 }
@@ -544,6 +556,7 @@ ov::Any Plugin::get_ro_property(const std::string& name, [[maybe_unused]] const
             RO_property(ov::device::architecture.name()),
         };
         // the whole config is RW before model is loaded.
+
         std::vector<ov::PropertyName> rwProperties{RW_property(ov::num_streams.name()),
                                                    RW_property(ov::inference_num_threads.name()),
                                                    RW_property(ov::enable_profiling.name()),
@@ -561,6 +574,7 @@ ov::Any Plugin::get_ro_property(const std::string& name, [[maybe_unused]] const
                                                    RW_property(ov::log::level.name()),
                                                    RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
                                                    RW_property(ov::intel_cpu::enable_tensor_parallel.name()),
+                                                   RW_property(ov::intel_cpu::tbb_partitioner.name()),
                                                    RW_property(ov::hint::dynamic_quantization_group_size.name()),
                                                    RW_property(ov::hint::kv_cache_precision.name()),
                                                    RW_property(ov::key_cache_precision.name()),
@@ -638,6 +652,9 @@ ov::Any Plugin::get_ro_property(const std::string& name, [[maybe_unused]] const
     if (name == ov::intel_cpu::enable_tensor_parallel) {
         return static_cast<decltype(ov::intel_cpu::enable_tensor_parallel)::value_type>(engConfig.enableTensorParallel);
     }
+    if (name == ov::intel_cpu::tbb_partitioner) {
+        return static_cast<decltype(ov::intel_cpu::tbb_partitioner)::value_type>(engConfig.tbbPartitioner);
+    }
     if (name == ov::execution_devices) {
         return decltype(ov::execution_devices)::value_type{get_device_name()};
     }
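
The caching logic above round-trips the plugin hints through public ov::Model rt_info accessors only. A self-contained sketch of the same mechanism, reusing the key names from this patch; the one-node model is purely illustrative:

    #include <iostream>
    #include <memory>
    #include <openvino/core/model.hpp>
    #include <openvino/op/parameter.hpp>
    #include <openvino/op/result.hpp>

    int main() {
        // Tiny parameter-to-result model, just something to hang rt_info on.
        auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1});
        auto result = std::make_shared<ov::op::v0::Result>(param);
        auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{param});

        // Save: the plugin stores the values as strings under one section key.
        ov::AnyMap hints_props{{"MODEL_PREFER_THREADS_LATENCY", "4"},
                               {"MODEL_PREFER_THREADS_THROUGHPUT", "16"},
                               {"TBB_PARTITIONER", "STATIC"}};
        model->set_rt_info(hints_props, "intel_cpu_hints_config");

        // Load: ov::Any converts the cached string back to the requested type.
        if (model->has_rt_info("intel_cpu_hints_config")) {
            const auto& cfg = model->get_rt_info<ov::AnyMap>("intel_cpu_hints_config");
            std::cout << cfg.at("MODEL_PREFER_THREADS_LATENCY").as<int>() << "\n";
        }
        return 0;
    }
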
diff --git a/src/plugins/intel_cpu/src/thread_pool_imp.cpp b/src/plugins/intel_cpu/src/thread_pool_imp.cpp
new file mode 100644
index 00000000000000..c612deff37f5e0
--- /dev/null
+++ b/src/plugins/intel_cpu/src/thread_pool_imp.cpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "thread_pool_imp.hpp"
+
+#include <memory>
+#include <oneapi/dnnl/dnnl.hpp>
+#include <oneapi/dnnl/dnnl_threadpool.hpp>
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+#    include <oneapi/tbb.h>
+
+#    include "cpu_parallel.hpp"
+#    include "openvino/core/parallel.hpp"
+#    include "openvino/runtime/intel_cpu/properties.hpp"
+#endif
+
+namespace ov::intel_cpu {
+
+dnnl::stream make_stream(const dnnl::engine& engine, const std::shared_ptr<ThreadPool>& thread_pool) {  // NOLINT
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+    static auto g_cpu_parallel = std::make_shared<CpuParallel>(ov::intel_cpu::TbbPartitioner::STATIC);
+    auto stream = dnnl::threadpool_interop::make_stream(
+        engine,
+        thread_pool ? thread_pool.get() : g_cpu_parallel->get_thread_pool().get());
+#else
+    auto stream = dnnl::stream(engine);
+#endif
+    return stream;
+}
+
+}  // namespace ov::intel_cpu
diff --git a/src/plugins/intel_cpu/src/thread_pool_imp.hpp b/src/plugins/intel_cpu/src/thread_pool_imp.hpp
new file mode 100644
index 00000000000000..686d6bb91312fb
--- /dev/null
+++ b/src/plugins/intel_cpu/src/thread_pool_imp.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <oneapi/dnnl/dnnl_threadpool_iface.hpp>
+
+#include "cpu_parallel.hpp"
+#include "openvino/core/parallel.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
+
+namespace ov::intel_cpu {
+
+class ThreadPool : public dnnl::threadpool_interop::threadpool_iface {
+public:
+    ThreadPool() = delete;
+    ThreadPool(ThreadPool&) = delete;
+    ThreadPool& operator=(ThreadPool&) = delete;
+    ThreadPool(ThreadPool&&) = delete;
+    ThreadPool& operator=(ThreadPool&&) = delete;
+
+    explicit ThreadPool(const CpuParallel& cpu_parallel) : m_cpu_parallel(cpu_parallel) {}
+
+    [[nodiscard]] int get_num_threads() const override {
+        return m_cpu_parallel.get_num_threads();
+    }
+    [[nodiscard]] bool get_in_parallel() const override {
+        return false;
+    }
+    [[nodiscard]] uint64_t get_flags() const override {
+        return 0;
+    }
+    void parallel_for(int n, const std::function<void(int, int)>& fn) override {
+        m_cpu_parallel.parallel_simple(n, fn);
+    }
+
+private:
+    const CpuParallel& m_cpu_parallel;
+};
+
+dnnl::stream make_stream(const dnnl::engine& engine, const std::shared_ptr<ThreadPool>& thread_pool = nullptr);
+
+}  // namespace ov::intel_cpu
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/repack_matmul_weights.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/repack_matmul_weights.cpp
index b1ad52580d4789..e766ff254aa132 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/repack_matmul_weights.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/repack_matmul_weights.cpp
@@ -104,7 +104,8 @@ bool RepackMatMulWeights::run_on_model(const std::shared_ptr<ov::Model>& model)
                                   eng,
                                   m_context->getParamsCache(),
                                   m_context->getWeightsCache(),
-                                  nullptr);
+                                  nullptr,
+                                  m_context->getCpuParallel()->get_thread_pool());
             weights_idxs.insert(i);
         }
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
index b78ef0204bb19a..afb0c4728c1577 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -48,6 +48,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable
         RO_property(ov::log::level.name()),
         RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
         RO_property(ov::intel_cpu::enable_tensor_parallel.name()),
+        RO_property(ov::intel_cpu::tbb_partitioner.name()),
         RO_property(ov::hint::dynamic_quantization_group_size.name()),
         RO_property(ov::hint::kv_cache_precision.name()),
         RO_property(ov::key_cache_precision.name()),
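
ThreadPool is a thin adapter: a oneDNN build with DNNL_CPU_RUNTIME=THREADPOOL calls back through whatever dnnl::threadpool_interop::threadpool_iface is attached to the stream, and here that callback lands in CpuParallel::parallel_simple. A self-contained sketch of the same interface backed by std::thread instead of CpuParallel (assumes a threadpool-enabled oneDNN build; the chunking is what any such adapter must provide):

    #include <oneapi/dnnl/dnnl.hpp>
    #include <oneapi/dnnl/dnnl_threadpool.hpp>
    #include <oneapi/dnnl/dnnl_threadpool_iface.hpp>

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <thread>
    #include <vector>

    // Minimal threadpool_iface implementation; CpuParallel plays this role in
    // the patch. oneDNN requires fn(i, n) to run once for every i in [0, n).
    class NaiveThreadPool : public dnnl::threadpool_interop::threadpool_iface {
    public:
        int get_num_threads() const override {
            return static_cast<int>(std::max(1U, std::thread::hardware_concurrency()));
        }
        bool get_in_parallel() const override {
            return false;
        }
        uint64_t get_flags() const override {
            return 0;
        }
        void parallel_for(int n, const std::function<void(int, int)>& fn) override {
            const int nthr = std::min(n, get_num_threads());
            std::vector<std::thread> workers;
            workers.reserve(nthr);
            for (int t = 0; t < nthr; t++) {
                workers.emplace_back([&, t]() {
                    for (int i = t; i < n; i += nthr) {
                        fn(i, n);  // strided share of the n closures
                    }
                });
            }
            for (auto& w : workers) {
                w.join();
            }
        }
    };

    int main() {
        dnnl::engine eng(dnnl::engine::kind::cpu, 0);
        NaiveThreadPool pool;
        // Equivalent of make_stream() in thread_pool_imp.cpp when a pool is given.
        dnnl::stream strm = dnnl::threadpool_interop::make_stream(eng, &pool);
        (void)strm;  // primitives executed on strm now fan out via the pool
        return 0;
    }
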
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
index bb786f3828becc..39a30f394074ca 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
@@ -61,6 +61,7 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) {
         RW_property(ov::log::level.name()),
         RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
         RW_property(ov::intel_cpu::enable_tensor_parallel.name()),
+        RW_property(ov::intel_cpu::tbb_partitioner.name()),
         RW_property(ov::hint::dynamic_quantization_group_size.name()),
         RW_property(ov::hint::kv_cache_precision.name()),
         RW_property(ov::key_cache_precision.name()),
diff --git a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp
index 7864dd2426c8cf..3d927e71afce8b 100644
--- a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp
+++ b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp
@@ -16,6 +16,7 @@
 #include "common_test_utils/common_utils.hpp"
 #include "memory_control.hpp"
 #include "nodes/input.h"
+#include "thread_pool_imp.hpp"
 
 using namespace ov::intel_cpu;
 namespace ReorderCPUTest {
@@ -139,7 +140,7 @@ class ReorderCPUTestGraph {
             n->initSupportedPrimitiveDescriptors();
             n->selectPrimitiveDescriptorByIndex(0);
         }
-        stream = dnnl::stream{cpuEngine};
+        stream = ov::intel_cpu::make_stream(cpuEngine, context->getCpuParallel()->get_thread_pool());
     }
 
 protected:
diff --git a/src/plugins/intel_cpu/thirdparty/CMakeLists.txt b/src/plugins/intel_cpu/thirdparty/CMakeLists.txt
index d32a2b0088d502..24bf14639919b2 100644
--- a/src/plugins/intel_cpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_cpu/thirdparty/CMakeLists.txt
@@ -49,7 +49,11 @@ function(ov_add_onednn)
     set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "" FORCE)
     set(DNNL_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
     set(DNNL_BUILD_TESTS OFF CACHE BOOL "" FORCE)
-    set(DNNL_CPU_RUNTIME "${THREADING}" CACHE STRING "" FORCE)
+    if("${THREADING}" STREQUAL "TBB_ADAPTIVE")
+        set(DNNL_CPU_RUNTIME "THREADPOOL" CACHE STRING "" FORCE)
+    else()
+        set(DNNL_CPU_RUNTIME "${THREADING}" CACHE STRING "" FORCE)
+    endif()
     set(DNNL_GPU_RUNTIME "NONE" CACHE STRING "" FORCE)
     set(DNNL_BLAS_VENDOR "NONE" CACHE STRING "" FORCE)
     set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "SSE41" CACHE STRING "" FORCE)
diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index 0cad963300cd2b..6e4715d2d6f635 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit 0cad963300cd2b80c371cb66d435c60ad0e5edd7
+Subproject commit 6e4715d2d6f635991eeb67e1ad584e94542c9f23
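
End to end: building with THREADING=TBB_ADAPTIVE selects oneDNN's THREADPOOL runtime (per the CMake change above), and the partitioner choice surfaces as a regular CPU plugin property. A hedged usage sketch; it assumes ov::intel_cpu::tbb_partitioner follows the usual ov::Property pattern implied by the plugin.cpp registration, and the model path is a placeholder:

    #include <iostream>
    #include <openvino/openvino.hpp>
    #include <openvino/runtime/intel_cpu/properties.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");  // placeholder path

        // TbbPartitioner::STATIC is the value the patch itself uses for the
        // fallback pool in thread_pool_imp.cpp.
        auto compiled =
            core.compile_model(model,
                               "CPU",
                               ov::intel_cpu::tbb_partitioner(ov::intel_cpu::TbbPartitioner::STATIC));

        std::cout << compiled.get_property(ov::intel_cpu::tbb_partitioner.name()).as<std::string>() << "\n";
        return 0;
    }
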