diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index 7aa4ea74e7ac07..7475a5585842ae 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -71,7 +71,7 @@ unset(_ov_download_tbb_done CACHE)
 # or ENABLE_SYSTEM_TBB is OFF
 #
 function(ov_download_tbb)
-    if(_ov_download_tbb_done OR NOT THREADING MATCHES "^(TBB|TBB_AUTO)$")
+    if(_ov_download_tbb_done OR NOT THREADING MATCHES "^(TBB|TBB_AUTO|TBB_ADAPTIVE)$")
         return()
     endif()
     set(_ov_download_tbb_done ON CACHE INTERNAL "Whether prebuilt TBB is already downloaded")
diff --git a/cmake/features.cmake b/cmake/features.cmake
index ebeaca5d3df7de..89313647cb703f 100644
--- a/cmake/features.cmake
+++ b/cmake/features.cmake
@@ -78,10 +78,15 @@ ov_dependent_option (ENABLE_PKGCONFIG_GEN "Enable openvino.pc pkg-config file ge
 # OpenVINO Runtime specific options
 #
 
-# "OneDNN library based on OMP or TBB or Sequential implementation: TBB|OMP|SEQ"
-set(THREADING_DEFAULT "TBB")
+# "OneDNN library based on OMP or TBB or Sequential implementation: TBB|OMP|SEQ|TBB_ADAPTIVE"
+if(AARCH64)
+    set(THREADING_DEFAULT "TBB")
+else()
+    set(THREADING_DEFAULT "TBB_ADAPTIVE")
+endif()
+
-set(THREADING_OPTIONS "TBB" "TBB_AUTO" "SEQ" "OMP")
+set(THREADING_OPTIONS "TBB" "TBB_AUTO" "SEQ" "OMP" "TBB_ADAPTIVE")
 
 set(THREADING "${THREADING_DEFAULT}" CACHE STRING "Threading")
 set_property(CACHE THREADING PROPERTY STRINGS ${THREADING_OPTIONS})
@@ -99,7 +104,7 @@ endif()
 ov_dependent_option (ENABLE_INTEL_OPENMP "Enables usage of Intel OpenMP instead of default compiler one" ${ENABLE_INTEL_OPENMP_DEFAULT} "THREADING STREQUAL OMP" OFF)
 
-if((THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO") AND
+if((THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE") AND
     (BUILD_SHARED_LIBS OR (LINUX AND X86_64)))
     set(ENABLE_TBBBIND_2_5_DEFAULT ON)
 else()
diff --git a/cmake/templates/OpenVINOConfig.cmake.in b/cmake/templates/OpenVINOConfig.cmake.in
index 448ac9017e4ad9..7c0dd41aefd7fb 100644
--- a/cmake/templates/OpenVINOConfig.cmake.in
+++ b/cmake/templates/OpenVINOConfig.cmake.in
@@ -169,7 +169,7 @@ endmacro()
 macro(_ov_find_tbb)
     set(_ov_threading "@THREADING@")
-    if(_ov_threading STREQUAL "TBB" OR _ov_threading STREQUAL "TBB_AUTO")
+    if(_ov_threading STREQUAL "TBB" OR _ov_threading STREQUAL "TBB_AUTO" OR _ov_threading STREQUAL "TBB_ADAPTIVE")
         set(enable_pkgconfig_tbb "@tbb_FOUND@")
 
         # try tbb.pc
@@ -563,7 +563,7 @@ if(_ov_as_external_package)
     # WA for cmake version < 3.16 which does not export
     # IMPORTED_LINK_DEPENDENT_LIBRARIES_** properties if no PUBLIC dependencies for the library
-    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO")
+    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE")
         foreach(type RELEASE DEBUG RELWITHDEBINFO MINSIZEREL)
             foreach(tbb_target TBB::tbb TBB::tbbmalloc PkgConfig::tbb)
                 if(TARGET ${tbb_target})
diff --git a/src/bindings/python/src/openvino/properties/intel_cpu/__init__.py b/src/bindings/python/src/openvino/properties/intel_cpu/__init__.py
index cd3e8d953a4395..3cb0f76e552681 100644
--- a/src/bindings/python/src/openvino/properties/intel_cpu/__init__.py
+++ b/src/bindings/python/src/openvino/properties/intel_cpu/__init__.py
@@ -2,6 +2,9 @@
 # Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+# Enums
+from openvino._pyopenvino.properties.intel_cpu import TbbPartitioner
+
 # Properties
 import openvino._pyopenvino.properties.intel_cpu as __intel_cpu
 from openvino.properties._properties import __make_properties
diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
index 5d194946bde609..81960cdefcbaa4 100644
--- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
+++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
@@ -107,11 +107,16 @@ void regmodule_properties(py::module m) {
     py::module m_intel_cpu =
         m_properties.def_submodule("intel_cpu", "openvino.properties.intel_cpu submodule that simulates ov::intel_cpu");
 
+    py::enum_<ov::intel_cpu::TbbPartitioner>(m_intel_cpu, "TbbPartitioner", py::arithmetic())
+        .value("STATIC", ov::intel_cpu::TbbPartitioner::STATIC)
+        .value("AUTO", ov::intel_cpu::TbbPartitioner::AUTO);
+
     // Submodule intel_cpu property
     wrap_property_RW(m_intel_cpu, ov::intel_cpu::denormals_optimization, "denormals_optimization");
     wrap_property_RW(m_intel_cpu, ov::intel_cpu::sparse_weights_decompression_rate, "sparse_weights_decompression_rate");
+    wrap_property_RW(m_intel_cpu, ov::intel_cpu::tbb_partitioner, "tbb_partitioner");
 
     // Submodule intel_gpu
     py::module m_intel_gpu =
diff --git a/src/bindings/python/src/pyopenvino/utils/utils.cpp b/src/bindings/python/src/pyopenvino/utils/utils.cpp
index dde59a79a8041b..8a233317a9df94 100644
--- a/src/bindings/python/src/pyopenvino/utils/utils.cpp
+++ b/src/bindings/python/src/pyopenvino/utils/utils.cpp
@@ -18,6 +18,7 @@
 #include "openvino/core/meta_data.hpp"
 #include "openvino/frontend/decoder.hpp"
 #include "openvino/frontend/graph_iterator.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 #include "openvino/runtime/properties.hpp"
 
 using Version = ov::pass::Serialize::Version;
@@ -245,6 +246,8 @@ py::object from_ov_any(const ov::Any& any) {
         return py::cast(any.as());
     } else if (any.is()) {
         return py::cast(any.as());
+    } else if (any.is<ov::intel_cpu::TbbPartitioner>()) {
+        return py::cast(any.as<ov::intel_cpu::TbbPartitioner>());
     } else if (any.is()) {
         return py::cast(any.as());
     } else if (any.is()) {
@@ -544,6 +547,8 @@ ov::Any py_object_to_any(const py::object& py_obj) {
         return py::cast(py_obj);
     } else if (py::isinstance(py_obj)) {
         return py::cast(py_obj);
+    } else if (py::isinstance<ov::intel_cpu::TbbPartitioner>(py_obj)) {
+        return py::cast<ov::intel_cpu::TbbPartitioner>(py_obj);
     } else if (py::isinstance(py_obj)) {
         return py::cast(py_obj);
     } else if (py::isinstance(py_obj)) {
diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py
index 266c8690b78231..61f08acef3ccea 100644
--- a/src/bindings/python/tests/test_runtime/test_properties.py
+++ b/src/bindings/python/tests/test_runtime/test_properties.py
@@ -116,6 +116,13 @@ def test_properties_rw_base():
             (log.Level.TRACE, "Level.TRACE", 4),
         ),
     ),
+    (
+        intel_cpu.TbbPartitioner,
+        (
+            (intel_cpu.TbbPartitioner.STATIC, "TbbPartitioner.STATIC", 1),
+            (intel_cpu.TbbPartitioner.AUTO, "TbbPartitioner.AUTO", 2),
+        ),
+    ),
     (
         intel_auto.SchedulePolicy,
         (
@@ -367,6 +374,14 @@ def test_properties_ro(ov_property_ro, expected_value):
             (2.0, 2.0),
         ),
     ),
+    (
+        intel_cpu.tbb_partitioner,
+        "TBB_PARTITIONER",
+        (
+            (intel_cpu.TbbPartitioner.STATIC, intel_cpu.TbbPartitioner.STATIC),
+            (intel_cpu.TbbPartitioner.AUTO, intel_cpu.TbbPartitioner.AUTO),
+        ),
+    ),
     (
         intel_auto.device_bind_buffer,
         "DEVICE_BIND_BUFFER",
diff --git a/src/cmake/install_tbb.cmake b/src/cmake/install_tbb.cmake
index dc126165ba77c4..a9a909b6d02d3c 100644
--- a/src/cmake/install_tbb.cmake
+++ b/src/cmake/install_tbb.cmake
@@ -66,7 +66,7 @@ unset(_ov_dynamic_tbbbind_2_5_found)
 # install TBB
 
 # define variables for OpenVINOConfig.cmake
-if(THREADING MATCHES "^(TBB|TBB_AUTO)$")
+if(THREADING MATCHES "^(TBB|TBB_AUTO|TBB_ADAPTIVE)$")
     set(OV_TBB_DIR "${TBB_DIR}")
     list(APPEND PATH_VARS "OV_TBB_DIR")
 endif()
@@ -80,7 +80,7 @@ endif()
 # - downloaded TBB should be a part of all packages
 # - custom TBB provided by users, needs to be a part of wheel packages
 # - system TBB also needs to be a part of wheel packages
-if(THREADING MATCHES "^(TBB|TBB_AUTO)$" AND
+if(THREADING MATCHES "^(TBB|TBB_AUTO|TBB_ADAPTIVE)$" AND
     ( (DEFINED TBBROOT AND TBBROOT MATCHES ${TEMP}) OR
       (DEFINED TBBROOT OR DEFINED TBB_DIR OR DEFINED ENV{TBBROOT} OR DEFINED ENV{TBB_DIR}) OR
       ENABLE_SYSTEM_TBB ) )
diff --git a/src/cmake/ov_parallel.cmake b/src/cmake/ov_parallel.cmake
index cdb29b0aa37868..509d243a0da542 100644
--- a/src/cmake/ov_parallel.cmake
+++ b/src/cmake/ov_parallel.cmake
@@ -76,7 +76,7 @@ function(_ov_get_tbb_location tbb_target _tbb_lib_location_var)
 endfunction()
 
 macro(ov_find_package_tbb)
-    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" AND NOT TBB_FOUND)
+    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE" AND NOT TBB_FOUND)
         # conan generates TBBConfig.cmake files, which follows cmake's
         # SameMajorVersion scheme, while TBB itself follows AnyNewerVersion one
         # see https://cmake.org/cmake/help/latest/module/CMakePackageConfigHelpers.html#generating-a-package-version-file
@@ -340,7 +340,7 @@ macro(ov_find_package_openmp)
 endmacro()
 
 function(ov_set_threading_interface_for TARGET_NAME)
-    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" AND NOT TBB_FOUND)
+    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE" AND NOT TBB_FOUND)
         # find TBB
         ov_find_package_tbb()
 
@@ -383,9 +383,13 @@ function(ov_set_threading_interface_for TARGET_NAME)
         add_library(openvino::threading ALIAS openvino_threading)
     endif()
 
-    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO")
+    if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR THREADING STREQUAL "TBB_ADAPTIVE")
         if(TBB_FOUND)
-            set(_ov_thread_define "OV_THREAD_TBB")
+            if(THREADING STREQUAL "TBB_ADAPTIVE")
+                set(_ov_thread_define "OV_THREAD_TBB_ADAPTIVE")
+            else()
+                set(_ov_thread_define "OV_THREAD_TBB")
+            endif()
             set(_ov_threading_lib TBB::tbb)
         else()
             set(THREADING "SEQ" PARENT_SCOPE)
diff --git a/src/core/include/openvino/core/parallel.hpp b/src/core/include/openvino/core/parallel.hpp
index 38cb05ff9b6072..2fe54c2286704b 100644
--- a/src/core/include/openvino/core/parallel.hpp
+++ b/src/core/include/openvino/core/parallel.hpp
@@ -17,12 +17,14 @@
 #include
 #include
 
-#define OV_THREAD_TBB 0
-#define OV_THREAD_OMP 1
-#define OV_THREAD_SEQ 2
-#define OV_THREAD_TBB_AUTO 3
-
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#define OV_THREAD_TBB 0
+#define OV_THREAD_OMP 1
+#define OV_THREAD_SEQ 2
+#define OV_THREAD_TBB_AUTO 3
+#define OV_THREAD_TBB_ADAPTIVE 4
+
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
+#    define OV_THREAD_USE_TBB 1
 #    ifndef NOMINMAX
 #        define NOMINMAX
 #    endif
@@ -66,7 +68,7 @@ inline int parallel_get_env_threads() {
 inline void parallel_set_max_nested_levels(int levels) {
     return;
 }
-#    if OV_THREAD == OV_THREAD_TBB
+#    if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
 #        define PARTITIONING , tbb::static_partitioner()
 
 // The TBB version less than 2018u1 has no static_partitioner argument for
@@ -81,6 +83,7 @@ inline void parallel_set_max_nested_levels(int levels) {
 #        define PARTITIONING
 #    endif
#elif OV_THREAD == OV_THREAD_OMP
+#    define OV_THREAD_USE_TBB 0
 #    include <omp.h>
 #    if !defined(_OPENMP)
 #        error Undefined OpenMP version.
@@ -162,6 +165,7 @@ inline int parallel_get_nested_level() {
 }
 
 #elif OV_THREAD == OV_THREAD_SEQ
+#    define OV_THREAD_USE_TBB 0
 #    include
 inline int parallel_get_env_threads() {
     return 1;
@@ -231,7 +235,7 @@ namespace ov {
 
 template <typename F>
 void parallel_nt(int nthr, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     if (nthr == 0)
         nthr = parallel_get_max_threads();
     if (nthr == 1) {
@@ -279,7 +283,7 @@ void parallel_nt_static(int nthr, const F& func) {
     if (nthr == 0)
         nthr = parallel_get_max_threads();
 
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     tbb::parallel_for(
         0,
         nthr,
@@ -305,7 +309,7 @@ void parallel_nt_static(int nthr, const F& func) {
 
 template <typename I, typename F>
 void parallel_sort(I begin, I end, const F& comparator) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     tbb::parallel_sort(begin, end, comparator);
 #elif OV_THREAD == OV_THREAD_OMP
     // TODO: propose OpenMP version
@@ -317,7 +321,7 @@ void parallel_sort(I begin, I end, const F& comparator) {
 
 template <typename T0, typename R, typename F>
 R parallel_sum(const T0& D0, const R& input, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     return _TBB_REDUCE_FUNC(
         tbb::blocked_range<T0>(0, D0),
         input,
@@ -351,7 +355,7 @@ R parallel_sum(const T0& D0, const R& input, const F& func) {
 
 template <typename T0, typename T1, typename R, typename F>
 R parallel_sum2d(const T0& D0, const T1& D1, const R& input, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     return _TBB_REDUCE_FUNC(
         tbb::blocked_range2d<T0, T1>(0, D0, 0, D1),
         input,
@@ -391,7 +395,7 @@ R parallel_sum2d(const T0& D0, const T1& D1, const R& input, const F& func) {
 }
 template <typename T0, typename T1, typename T2, typename R, typename F>
 R parallel_sum3d(const T0& D0, const T1& D1, const T2& D2, const R& input, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     return _TBB_REDUCE_FUNC(
         tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2),
         input,
@@ -524,7 +528,7 @@ void parallel_for(const T0& D0, const F& func) {
     if (D0 == T0(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -590,7 +594,7 @@ void parallel_for2d(const T0& D0, const T1& D1, const F& func) {
     if (D0 == T0(0) || D1 == T1(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -636,7 +640,7 @@ void parallel_for2d(const T0& D0, const T1& D1, const F& func) {
 
 template <typename T0, typename T1, typename F>
 void parallel_for2d_dynamic(const T0& D0, const T1& D1, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     tbb::parallel_for(tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), [=](const tbb::blocked_range2d<T0, T1>& r) {
         for (T0 d0 = r.rows().begin(); d0 < r.rows().end(); d0++) {
             for (T1 d1 = r.cols().begin(); d1 < r.cols().end(); d1++) {
@@ -674,7 +678,7 @@ void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) {
     if (D0 == T0(0) || D1 == T1(0) || D2 == T2(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1 * D2);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -720,7 +724,7 @@ void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) {
 
 template <typename T0, typename T1, typename T2, typename F>
 void parallel_for3d_dynamic(const T0& D0, const T1& D1, const T2& D2, const F& func) {
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     tbb::parallel_for(tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2),
                       [=](const tbb::blocked_range3d<T0, T1, T2>& r) {
                           for (T0 d0 = r.pages().begin(); d0 < r.pages().end(); d0++) {
@@ -762,7 +766,7 @@ void parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons
     if (D0 == T0(0) || D1 == T1(0) || D2 == T2(0) || D3 == T3(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -838,7 +842,7 @@ void parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons
     if (D0 == T0(0) || D1 == T1(0) || D2 == T2(0) || D3 == T3(0) || D4 == T4(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3 * D4);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
@@ -916,7 +920,7 @@ void parallel_for6d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons
     if (D0 == T0(0) || D1 == T1(0) || D2 == T2(0) || D3 == T3(0) || D4 == T4(0) || D5 == T5(0)) {
         return;
     }
-#if OV_THREAD == OV_THREAD_TBB
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)
     auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3 * D4 * D5);
     int nthr = parallel_get_max_threads();
     if (static_cast<size_t>(nthr) > work_amount)
diff --git a/src/inference/dev_api/openvino/runtime/performance_heuristics.hpp b/src/inference/dev_api/openvino/runtime/performance_heuristics.hpp
index f57da5f2f5900a..9319c9dc43dff6 100644
--- a/src/inference/dev_api/openvino/runtime/performance_heuristics.hpp
+++ b/src/inference/dev_api/openvino/runtime/performance_heuristics.hpp
@@ -23,7 +23,15 @@ struct MemBandwidthPressure {
     float ratio_mem_limited_convs = 0;
     float ratio_mem_limited_deconvs = 0;
     float ratio_mem_limited_gemms = 0;
+    float ratio_mem_limited_adds = 0;
     float ratio_compute_deconvs = 0;
+    int total_gemms = 0;
+    int total_convs = 0;
+    int total_adds = 0;
+    int total_light_gemms = 0;
+    int total_light_convs = 0;
+    int total_heavy_convs = 0;
+    int total_nodes = 0;
 
     static constexpr float UNKNOWN = FLT_MAX;
     static constexpr float ALL = 1.0f;
diff --git a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
index 32f2a5b732b40a..67f53484dcbaec 100644
--- a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
+++ b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
@@ -11,7 +11,7 @@
 
 #include "openvino/core/parallel.hpp"
 
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 #    include <tbb/enumerable_thread_specific.h>
 #else
 #    include
@@ -25,7 +25,7 @@
 namespace ov {
 namespace threading {
 
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 
 /**
  * @brief A wrapper class to keep object to be thread local.
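The preprocessor changes above fold every "TBB or TBB_AUTO or TBB_ADAPTIVE" check into the single OV_THREAD_USE_TBB macro (1 for all TBB-backed modes, 0 for OMP and SEQ). A minimal sketch of how downstream code can rely on it; the queue alias is a hypothetical illustration, not part of this PR:

```cpp
#include <queue>

#include "openvino/core/parallel.hpp"

#if OV_THREAD_USE_TBB
// Covers OV_THREAD_TBB, OV_THREAD_TBB_AUTO and the new OV_THREAD_TBB_ADAPTIVE.
#    include <tbb/concurrent_queue.h>
template <typename T>
using FastQueue = tbb::concurrent_queue<T>;
#else
// OMP and SEQ builds define OV_THREAD_USE_TBB to 0, so fall back to std::queue
// (callers must provide their own locking in this branch).
template <typename T>
using FastQueue = std::queue<T>;
#endif
```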
diff --git a/src/inference/dev_api/openvino/runtime/threading/thread_safe_containers.hpp b/src/inference/dev_api/openvino/runtime/threading/thread_safe_containers.hpp
index eb299728898968..26bbfb5c75d5cc 100644
--- a/src/inference/dev_api/openvino/runtime/threading/thread_safe_containers.hpp
+++ b/src/inference/dev_api/openvino/runtime/threading/thread_safe_containers.hpp
@@ -13,7 +13,7 @@
 
 #include "openvino/core/parallel.hpp"
 
-#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 #    include
 #    include
 #endif
@@ -47,7 +47,7 @@ class ThreadSafeQueueWithSize {
     std::queue<T> _queue;
     std::mutex _mutex;
 };
-#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 template <typename T>
 using ThreadSafeQueue = tbb::concurrent_queue<T>;
 template <typename T>
diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index 9d63a0e078bdef..57c69b97ff003e 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -26,6 +26,60 @@ namespace ov {
  */
 namespace intel_cpu {
 
+/**
+ * @enum TbbPartitioner
+ * @brief This enum contains the definition of the TBB partitioner type.
+ */
+enum class TbbPartitioner {
+    NONE = 0,    //!< None value
+    STATIC = 1,  //!< Static partitioner
+    AUTO = 2     //!< Auto partitioner
+};
+
+/** @cond INTERNAL */
+inline std::ostream& operator<<(std::ostream& os, const TbbPartitioner& tbb_partitioner) {
+    switch (tbb_partitioner) {
+    case TbbPartitioner::STATIC:
+        return os << "STATIC";
+    case TbbPartitioner::AUTO:
+        return os << "AUTO";
+    case TbbPartitioner::NONE:
+        return os << "NONE";
+    default:
+        OPENVINO_THROW("Unsupported tbb partitioner!");
+    }
+}
+
+inline std::istream& operator>>(std::istream& is, TbbPartitioner& tbb_partitioner) {
+    std::string str;
+    is >> str;
+    if (str == "STATIC") {
+        tbb_partitioner = TbbPartitioner::STATIC;
+    } else if (str == "AUTO") {
+        tbb_partitioner = TbbPartitioner::AUTO;
+    } else if (str == "NONE") {
+        tbb_partitioner = TbbPartitioner::NONE;
+    } else {
+        OPENVINO_THROW("Unsupported tbb partitioner: ", str);
+    }
+    return is;
+}
+/** @endcond */
+
+/**
+ * @brief This property defines which TBB partitioner is used for parallel regions.
+ * @ingroup ov_runtime_cpp_prop_api
+ *
+ * Developers can use this property to select the type of TBB partitioner.
+ *
+ * The following code is an example of setting the auto partitioner; the default is STATIC.
+ *
+ * @code
+ * core.set_property("CPU", ov::intel_cpu::tbb_partitioner(ov::intel_cpu::TbbPartitioner::AUTO));
+ * @endcode
+ */
+static constexpr Property<TbbPartitioner> tbb_partitioner{"TBB_PARTITIONER"};
+
 /**
  * @brief This property define whether to perform denormals optimization.
  * @ingroup ov_runtime_cpu_prop_cpp_api
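For reference, a hedged end-to-end sketch of the new property from application code; it assumes the usual ov::Core workflow, and "model.xml" is a placeholder path:

```cpp
#include <iostream>

#include "openvino/openvino.hpp"
#include "openvino/runtime/intel_cpu/properties.hpp"

int main() {
    ov::Core core;
    // Request the auto partitioner before compilation; STATIC is the default.
    core.set_property("CPU", ov::intel_cpu::tbb_partitioner(ov::intel_cpu::TbbPartitioner::AUTO));

    auto compiled = core.compile_model("model.xml", "CPU");

    // The compiled model exposes the effective value as a read-only property.
    auto partitioner = compiled.get_property(ov::intel_cpu::tbb_partitioner);
    std::cout << partitioner << std::endl;  // prints "AUTO" via operator<< above
    return 0;
}
```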
diff --git a/src/inference/src/dev/performance_heuristics.cpp b/src/inference/src/dev/performance_heuristics.cpp
index a0bf9f96fa8e4d..c49f1a5fdacc77 100644
--- a/src/inference/src/dev/performance_heuristics.cpp
+++ b/src/inference/src/dev/performance_heuristics.cpp
@@ -4,6 +4,7 @@
 
 #include "openvino/runtime/performance_heuristics.hpp"
 
+#include
 
 namespace ov {
 
 MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
@@ -11,7 +12,13 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
                                  float { return (cache_size / (size_data_moved * datatype_size)); };
@@ -26,8 +33,11 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
     for (const auto& node : model->get_ordered_ops()) {
         const auto node_name = node->get_type_info().name;
+
+        total_nodes++;
+
         if (std::strcmp("MatMul", node_name) && std::strcmp("Convolution", node_name) &&
-            std::strcmp("ConvolutionBackpropData", node_name)) {
+            std::strcmp("Add", node_name) && std::strcmp("ConvolutionBackpropData", node_name)) {
             if (!std::strcmp("GRUSequence", node_name) || !std::strcmp("TensorIterator", node_name)) {
                 MemBandwidthPressure res;
                 res.max_mem_tolerance = MemBandwidthPressure::UNKNOWN;
@@ -65,6 +75,10 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
             const auto kernels = node->input(1);
             total_convs++;
+
+            if (kernels.get_partial_shape().is_static() && output.get_partial_shape().is_static()) {
+                const auto& shapeOutput = output.get_shape();
+                const auto& shapeInput1 = kernels.get_shape();
+                dataSizeOutput =
+                    std::accumulate(shapeOutput.begin(), shapeOutput.end(), size_t(1), std::multiplies<size_t>());
+                auto conv_indicator = dataSizeOutput * data_type_size;
+                for (size_t n = 1; n < shapeInput1.size(); n++) {
+                    conv_indicator = conv_indicator * shapeInput1[n];
+                }
+                if (conv_indicator < light_convs_threshold) {
+                    total_light_convs++;
+                }
+                if (conv_indicator > heavy_convs_threshold) {
+                    total_heavy_convs++;
+                }
+            }
+
             if (kernels.get_partial_shape().is_static()) {
                 const auto& shape = kernels.get_shape();
                 if (shape.size() >= 4 /* conventional 2D/3D conv */ && shape[2] >= 3 && shape[3] >= 3) {
@@ -83,6 +115,7 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
             }
+
             if (shape.size() > 4 /*5D*/ && isINT8) {
                 compute_convs++;
                 continue;
@@ -90,7 +123,9 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
                     std::accumulate(shapeInput.begin(), shapeInput.end(), size_t(1), std::multiplies<size_t>());
                 dataSizeOutput =
-                    std::accumulate(shapeOutput.begin(), shapeOutput.end(), size_t(1), std::multiplies<size_t>());
+                    dataSizeOutput == 0
+                        ? std::accumulate(shapeOutput.begin(), shapeOutput.end(), size_t(1), std::multiplies<size_t>())
+                        : dataSizeOutput;
                 const auto factor = memLimitedFactor(static_cast<float>(dataSizeInput + dataSizeOutput), data_type_size);
                 mem_limited_convs += factor < mem_threshold_assume_limited;
                 worst_case = std::min(factor, worst_case);
@@ -116,6 +151,21 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
+        } else if (!std::strcmp("Add", node_name)) {
+            const auto input = node->input(0);
+            const auto output = node->output(0);
+            // Check that input and output shapes are fully defined (not dynamic)
+            if (input.get_partial_shape().is_static() && output.get_partial_shape().is_static()) {
+                const auto& shapeInput = input.get_shape();
+                const auto& shapeOutput = output.get_shape();
+                total_adds++;
+                dataSizeInput =
+                    std::accumulate(shapeInput.begin(), shapeInput.end(), size_t(1), std::multiplies<size_t>());
+                dataSizeOutput =
+                    std::accumulate(shapeOutput.begin(), shapeOutput.end(), size_t(1), std::multiplies<size_t>());
+                const auto factor = memLimitedFactor(static_cast<float>(dataSizeInput + dataSizeOutput), data_type_size);
+                mem_limited_adds += factor < mem_threshold_assume_limited;
+            }
         }
     }
 
     MemBandwidthPressure res;
@@ -123,8 +173,16 @@ MemBandwidthPressure mem_bandwidth_pressure_tolerance(const std::shared_ptr<ov::Model> model,
     res.ratio_mem_limited_convs = total_convs ? static_cast<float>(mem_limited_convs) / total_convs : 0;
     res.ratio_mem_limited_deconvs = total_deconvs ? static_cast<float>(mem_limited_deconvs) / total_deconvs : 0;
     res.ratio_mem_limited_gemms = total_gemms ? static_cast<float>(mem_limited_gemms) / total_gemms : 0;
+    res.ratio_mem_limited_adds = total_adds ? static_cast<float>(mem_limited_adds) / total_adds : 0;
     res.ratio_compute_convs = total_convs ? static_cast<float>(compute_convs) / total_convs : 0;
     res.ratio_compute_deconvs = total_deconvs ? static_cast<float>(compute_deconvs) / total_deconvs : 0;
+    res.total_gemms = total_gemms;
+    res.total_convs = total_convs;
+    res.total_adds = total_adds;
+    res.total_heavy_convs = total_heavy_convs;
+    res.total_light_convs = total_light_convs;
+    res.total_light_gemms = total_light_gemms;
+    res.total_nodes = total_nodes;
 
     return res;
 }
diff --git a/src/inference/src/dev/threading/cpu_streams_executor.cpp b/src/inference/src/dev/threading/cpu_streams_executor.cpp
index 1fbab9ba5f29b0..10755de34fff69 100644
--- a/src/inference/src/dev/threading/cpu_streams_executor.cpp
+++ b/src/inference/src/dev/threading/cpu_streams_executor.cpp
@@ -26,7 +26,7 @@ namespace ov {
 namespace threading {
 struct CPUStreamsExecutor::Impl {
     struct Stream {
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         struct Observer : public custom::task_scheduler_observer {
             CpuSet _mask;
             int _ncpus = 0;
@@ -66,7 +66,7 @@ struct CPUStreamsExecutor::Impl {
                                ((_impl->_config.get_streams() + _impl->_usedNumaNodes.size() - 1) /
                                 _impl->_usedNumaNodes.size()))
                         : _impl->_usedNumaNodes.at(_streamId % _impl->_usedNumaNodes.size());
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
             if (_impl->_config.get_streams_info_table().size() > 0) {
                 init_stream();
             }
@@ -91,14 +91,14 @@ struct CPUStreamsExecutor::Impl {
                 std::lock_guard<std::mutex> lock{_impl->_streamIdMutex};
                 _impl->_streamIdQueue.push(_streamId);
             }
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
             if (nullptr != _observer) {
                 _observer->observe(false);
             }
 #endif
         }
 
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         void create_tbb_task_arena(const int stream_id,
                                    const StreamCreateType stream_type,
                                    const int concurrency,
@@ -219,7 +219,7 @@ struct CPUStreamsExecutor::Impl {
         bool _execute = false;
         std::vector<int> _rank;
         std::queue<Task> _taskQueue;
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         std::unique_ptr<custom::task_arena> _taskArena;
         std::unique_ptr<Observer> _observer;
         std::vector<int> _cpu_ids;
@@ -408,7 +408,7 @@ struct CPUStreamsExecutor::Impl {
     }
 
     void Execute(const Task& task, Stream& stream) {
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         auto& arena = stream._taskArena;
         if (nullptr != arena) {
             arena->execute(std::move(task));
diff --git a/src/inference/src/dev/threading/executor_manager.cpp b/src/inference/src/dev/threading/executor_manager.cpp
index ae6c9ef7fa40d1..c9c9ed6fe9c7ab 100644
--- a/src/inference/src/dev/threading/executor_manager.cpp
+++ b/src/inference/src/dev/threading/executor_manager.cpp
@@ -7,7 +7,7 @@
 #include "openvino/core/parallel.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "openvino/runtime/threading/cpu_streams_executor.hpp"
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 #    if (TBB_INTERFACE_VERSION < 12000)
 #        include
 #    else
 #        include
 #    endif
@@ -47,7 +47,7 @@ class ExecutorManagerImpl : public ExecutorManager {
     bool tbbTerminateFlag = false;
     mutable std::mutex global_mutex;
     bool tbbThreadsCreated = false;
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 #    if (TBB_INTERFACE_VERSION < 12000)
     std::shared_ptr tbbTaskScheduler = nullptr;
 #    else
@@ -67,7 +67,7 @@ void ExecutorManagerImpl::set_property(const ov::AnyMap& properties) {
     for (const auto& it : properties) {
         if (it.first == ov::force_tbb_terminate.name()) {
             tbbTerminateFlag = it.second.as<bool>();
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
             if (tbbTerminateFlag) {
                 if (!tbbTaskScheduler) {
 #    if (TBB_INTERFACE_VERSION < 12000)
@@ -97,7 +97,7 @@ ov::Any ExecutorManagerImpl::get_property(const std::string& name) const {
 void ExecutorManagerImpl::reset_tbb() {
     std::lock_guard<std::mutex> guard(global_mutex);
     if (tbbTerminateFlag) {
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
         if (tbbTaskScheduler && tbbThreadsCreated) {
 #    if (TBB_INTERFACE_VERSION < 12000)
             tbbTaskScheduler->terminate();
diff --git a/src/inference/src/dev/threading/parallel_custom_arena.cpp b/src/inference/src/dev/threading/parallel_custom_arena.cpp
index 7db44acb6fa9d5..4a25e206c3a0ad 100644
--- a/src/inference/src/dev/threading/parallel_custom_arena.cpp
+++ b/src/inference/src/dev/threading/parallel_custom_arena.cpp
@@ -7,7 +7,7 @@
 
 #include "dev/threading/itt.hpp"
 
-#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE
 
 #    define TBB_NUMA_SUPPORT_PRESENT (TBB_INTERFACE_VERSION >= 11100)
 
 #    if defined(__APPLE__)
@@ -332,4 +332,5 @@ int default_concurrency(numa_node_id id) {
 }  // namespace info
 }  // namespace custom
 
-#endif /*OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO*/
+#endif /*OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == \
+          OV_THREAD_TBB_ADAPTIVE*/
diff --git a/src/inference/src/dev/threading/parallel_custom_arena.hpp b/src/inference/src/dev/threading/parallel_custom_arena.hpp
index 6ac3864c0fc220..e24d8194568216 100644
--- a/src/inference/src/dev/threading/parallel_custom_arena.hpp
+++ b/src/inference/src/dev/threading/parallel_custom_arena.hpp
@@ -13,7 +13,7 @@
 #include "openvino/core/parallel.hpp"
 #include "openvino/runtime/common.hpp"
 
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
 
 #    include
 #    include
@@ -173,5 +173,4 @@ int default_concurrency(numa_node_id id = task_arena::automatic);
 int default_concurrency(task_arena::constraints c);
 }  // namespace info
 }  // namespace custom
-#endif /*(OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)*/
-
+#endif /*(OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE)*/
diff --git a/src/inference/src/os/cpu_map_info.hpp b/src/inference/src/os/cpu_map_info.hpp
index 097057bc054b28..be6f3c587118cb 100644
--- a/src/inference/src/os/cpu_map_info.hpp
+++ b/src/inference/src/os/cpu_map_info.hpp
@@ -82,6 +82,7 @@ void parse_node_info_linux(const std::vector node_info_table,
 * @param[out] _numa_nodes total number for nodes in system
 * @param[out] _sockets total number for sockets in system
 * @param[out] _cores total number for physical CPU cores in system
+ * @param[out] _blocked_cores total number for blocked processors in system
 * @param[out] _proc_type_table summary table of number of processors per type
 * @param[out] _cpu_mapping_table CPU mapping table for each processor
 * @return
@@ -92,6 +93,7 @@ void parse_cache_info_linux(const std::vector> system_i
                             int& _numa_nodes,
                             int& _sockets,
                             int& _cores,
+                            int& _blocked_cores,
                             std::vector<std::vector<int>>& _proc_type_table,
                             std::vector<std::vector<int>>& _cpu_mapping_table);
diff --git a/src/inference/src/os/lin/lin_system_conf.cpp b/src/inference/src/os/lin/lin_system_conf.cpp
index 165f666bcbf2aa..2b58599e129ed5 100644
--- a/src/inference/src/os/lin/lin_system_conf.cpp
+++ b/src/inference/src/os/lin/lin_system_conf.cpp
@@ -248,6 +248,7 @@ CPU::CPU() {
                               _numa_nodes,
                               _sockets,
                               _cores,
+                              _blocked_cores,
                               _proc_type_table,
                               _cpu_mapping_table);
     }
@@ -400,6 +401,7 @@ void parse_cache_info_linux(const std::vector> system_i
                             int& _numa_nodes,
                             int& _sockets,
                             int& _cores,
+                            int& _blocked_cores,
                             std::vector<std::vector<int>>& _proc_type_table,
                             std::vector<std::vector<int>>& _cpu_mapping_table) {
     int n_group = 0;
@@ -486,6 +488,16 @@ void parse_cache_info_linux(const std::vector> system_i
 
             for (int m = core_1; m <= core_2; m++) {
                 update_proc_info(m, core_type);
+
+                if ((core_2 - core_1 == 1) &&
+                    _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] == LP_EFFICIENT_CORE_PROC) {
+                    _cpu_mapping_table[m][CPU_MAP_GROUP_ID] = CPU_BLOCKED;
+                    _cpu_mapping_table[m][CPU_MAP_USED_FLAG] = CPU_BLOCKED;
+                    _blocked_cores++;
+                    _cores--;
+                    _proc_type_table[0][ALL_PROC]--;
+                    _proc_type_table[0][_cpu_mapping_table[m][CPU_MAP_CORE_TYPE]]--;
+                }
             }
         } else {
             core_1 = std::stoi(system_info_table[nproc][0]);
@@ -544,11 +556,6 @@ void parse_cache_info_linux(const std::vector> system_i
                         sub_str = system_info_table[n][info_index].substr(endpos + 1);
                         core_2 = std::stoi(sub_str);
 
-                        if ((info_index == 1) && (core_2 - core_1 == 1) &&
-                            (_proc_type_table[0][EFFICIENT_CORE_PROC] > 0)) {
-                            offline_list.push_back(n);
-                            break;
-                        }
                         for (int m = core_1; m <= core_2; m++) {
                             _cpu_mapping_table[m][CPU_MAP_SOCKET_ID] = _sockets;
                             _cpu_mapping_table[m][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[m][CPU_MAP_SOCKET_ID];
@@ -607,6 +614,8 @@ void parse_cache_info_linux(const std::vector> system_i
         _cpu_mapping_table.erase(_cpu_mapping_table.begin() + offline_list[n] - n);
         _processors--;
     }
+
+    _processors = _processors - _blocked_cores;
 };
 
 void get_cpu_mapping_from_cores(const int _processors,
diff --git a/src/inference/src/os/win/win_system_conf.cpp b/src/inference/src/os/win/win_system_conf.cpp
index 8786ac601e7cf9..31508c6efecce2 100644
--- a/src/inference/src/os/win/win_system_conf.cpp
+++ b/src/inference/src/os/win/win_system_conf.cpp
@@ -318,7 +318,7 @@ int get_number_of_cpu_cores(bool bigCoresOnly) {
         phys_cores++;
     } while (offset < sz);
 
-#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if OV_THREAD_USE_TBB
     auto core_types = custom::info::core_types();
     if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ {
         phys_cores = custom::info::default_concurrency(
@@ -328,7 +328,7 @@ int get_number_of_cpu_cores(bool bigCoresOnly) {
     return phys_cores;
 }
 
-#if !(OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#if !OV_THREAD_USE_TBB
 // OMP/SEQ threading on the Windows doesn't support NUMA
 std::vector<int> get_available_numa_nodes() {
     return {-1};
diff --git a/src/inference/src/system_conf.cpp b/src/inference/src/system_conf.cpp
index 318fbf94c65510..dd47185947e381 100644
--- a/src/inference/src/system_conf.cpp
+++ b/src/inference/src/system_conf.cpp
@@ -291,7 +291,7 @@ CPU& cpu_info() {
 int get_number_of_cpu_cores(bool) {
     return parallel_get_max_threads();
 }
-#    if !((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#    if !((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 std::vector<int> get_available_numa_nodes() {
     return {-1};
 }
@@ -346,7 +346,7 @@ int get_org_numa_id(int numa_node_id) {
 int get_number_of_cpu_cores(bool) {
     return parallel_get_max_threads();
 }
-#    if !((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#    if !((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 std::vector<int> get_available_numa_nodes() {
     return {-1};
 }
@@ -420,7 +420,7 @@ int get_number_of_cpu_cores(bool bigCoresOnly) {
     OPENVINO_ASSERT(totalNumberOfCpuCores != 0, "Total number of cpu cores can not be 0.");
 
     int phys_cores = totalNumberOfCpuCores;
-#    if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#    if OV_THREAD_USE_TBB
     auto core_types = custom::info::core_types();
     if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ {
         phys_cores = custom::info::default_concurrency(
@@ -430,7 +430,7 @@ int get_number_of_cpu_cores(bool bigCoresOnly) {
     return phys_cores;
 }
 
-#    if !((OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO))
+#    if !((OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 std::vector<int> get_available_numa_nodes() {
     CPU& cpu = cpu_info();
     std::vector<int> nodes((0 == cpu._numa_nodes) ? 1 : cpu._numa_nodes);
@@ -558,7 +558,7 @@ void set_cpu_used(const std::vector<int>& cpu_ids, const int used) {
 int get_number_of_logical_cpu_cores(bool bigCoresOnly) {
     int logical_cores = parallel_get_max_threads();
-#    if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+#    if OV_THREAD_USE_TBB
     auto core_types = custom::info::core_types();
     if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ {
         logical_cores = custom::info::default_concurrency(
@@ -592,7 +592,7 @@ int get_org_numa_id(int numa_node_id) {
 }
 #endif
 
-#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
+#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO) || (OV_THREAD == OV_THREAD_TBB_ADAPTIVE))
 std::vector<int> get_available_numa_nodes() {
     return custom::info::numa_nodes();
 }
diff --git a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp
index 9a200e20c0fa51..09e7854db98d1a 100644
--- a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp
+++ b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp
@@ -20,6 +20,7 @@ struct LinuxCpuMapTestCase {
     int _numa_nodes;
     int _sockets;
     int _cores;
+    int _blocked_cores;
     std::vector<std::vector<int>> _proc_type_table;
     std::vector<std::vector<int>> _cpu_mapping_table;
     std::vector<std::vector<std::string>> system_info_table;
@@ -36,6 +37,7 @@ class LinuxCpuMapCacheParserTests : public ov::test::TestsCommon,
     int test_numa_nodes = 0;
     int test_sockets = 0;
     int test_cores = 0;
+    int test_blocked_cores = 0;
     std::vector<std::vector<int>> test_proc_type_table;
     std::vector<std::vector<int>> test_cpu_mapping_table;
 
@@ -45,6 +47,7 @@ class LinuxCpuMapCacheParserTests : public ov::test::TestsCommon,
                                test_numa_nodes,
                                test_sockets,
                                test_cores,
+                               test_blocked_cores,
                                test_proc_type_table,
                                test_cpu_mapping_table);
 
@@ -52,6 +55,7 @@ class LinuxCpuMapCacheParserTests : public ov::test::TestsCommon,
         ASSERT_EQ(test_data._numa_nodes, test_numa_nodes);
         ASSERT_EQ(test_data._sockets, test_sockets);
         ASSERT_EQ(test_data._cores, test_cores);
+        ASSERT_EQ(test_data._blocked_cores, test_blocked_cores);
         ASSERT_EQ(test_data._proc_type_table, test_proc_type_table);
         ASSERT_EQ(test_data._cpu_mapping_table, test_cpu_mapping_table);
     }
@@ -82,6 +86,7 @@ LinuxCpuMapTestCase cache_2sockets_104cores_hyperthreading = {
    2,    // param[expected out]: total 2 numa nodes on this simulated platform
     2,    // param[expected out]: total 2 sockets on this simulated platform
     104,  // param[expected out]: total 104 CPU cores on this simulated platform
+    0,    // param[expected out]: total 0 blocked processors on this simulated platform
     {{208, 104, 0, 0, 104, -1, -1},
      {104, 52, 0, 0, 52, 0, 0},
      {104, 52, 0, 0, 52, 1, 1}},  // param[expected out]: The proc_type_table of this simulated platform
@@ -304,6 +309,7 @@ LinuxCpuMapTestCase cache_1sockets_96cores = {
     1,
     1,
     96,
+    0,
     {{96, 0, 96, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, EFFICIENT_CORE_PROC, 0, -1},  {1, 0, 0, 1, EFFICIENT_CORE_PROC, 0, -1},
@@ -390,6 +396,7 @@ LinuxCpuMapTestCase cache_2sockets_56cores_hyperthreading = {
     2,
     2,
     56,
+    0,
     {{110, 56, 0, 0, 54, -1, -1}, {54, 28, 0, 0, 26, 0, 0}, {56, 28, 0, 0, 28, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
@@ -572,6 +579,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores_hyperthreading = {
     2,
     2,
     48,
+    0,
     {{96, 48, 0, 0, 48, -1, -1}, {48, 24, 0, 0, 24, 0, 0}, {48, 24, 0, 0, 24, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
@@ -664,6 +672,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores_hyperthreading_1 = {
     4,
     2,
     48,
+    0,
     {{96, 48, 0, 0, 48, -1, -1},
      {24, 12, 0, 0, 12, 0, 0},
      {24, 12, 0, 0, 12, 1, 0},
@@ -760,6 +769,7 @@ LinuxCpuMapTestCase cache_2sockets_24cores_hyperthreading = {
     2,
     2,
     24,
+    0,
     {{48, 24, 0, 0, 24, -1, -1}, {24, 12, 0, 0, 12, 0, 0}, {24, 12, 0, 0, 12, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},   {1, 1, 1, 12, HYPER_THREADING_PROC, 12, -1},
@@ -845,6 +855,7 @@ LinuxCpuMapTestCase cache_2sockets_24cores_hyperthreading_1 = {
     4,
     2,
     24,
+    0,
     {{48, 24, 0, 0, 24, -1, -1},
      {12, 6, 0, 0, 6, 0, 0},
      {12, 6, 0, 0, 6, 1, 0},
@@ -936,6 +947,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores = {
     2,
     2,
     48,
+    0,
     {{48, 48, 0, 0, 0, -1, -1}, {24, 24, 0, 0, 0, 0, 0}, {24, 24, 0, 0, 0, 1, 1}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},   {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1},
@@ -988,6 +1000,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores_1 = {
     2,
     2,
     48,
+    0,
     {{48, 48, 0, 0, 0, -1, -1}, {24, 24, 0, 0, 0, 0, 0}, {24, 24, 0, 0, 0, 1, 1}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},   {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1},
@@ -1040,6 +1053,7 @@ LinuxCpuMapTestCase cache_2sockets_48cores_2 = {
     4,
     2,
     48,
+    0,
     {{48, 48, 0, 0, 0, -1, -1},
      {12, 12, 0, 0, 0, 0, 0},
      {12, 12, 0, 0, 0, 1, 0},
@@ -1096,6 +1110,7 @@ LinuxCpuMapTestCase cache_2sockets_20cores_hyperthreading = {
     2,
     2,
     20,
+    0,
     {{40, 20, 0, 0, 20, -1, -1}, {20, 10, 0, 0, 10, 0, 0}, {20, 10, 0, 0, 10, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
@@ -1148,6 +1163,7 @@ LinuxCpuMapTestCase cache_2sockets_20cores_hyperthreading_1 = {
     2,
     2,
     20,
+    0,
     {{40, 20, 0, 0, 20, -1, -1}, {20, 10, 0, 0, 10, 0, 0}, {20, 10, 0, 0, 10, 1, 1}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
@@ -1200,18 +1216,31 @@ LinuxCpuMapTestCase cache_1sockets_16cores_hyperthreading = {
     1,
     1,
     14,
+    2,
     {{20, 6, 8, 0, 6, 0, 0}},
     {
-        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
-        {2, 0, 0, 1, MAIN_CORE_PROC, 1, -1},        {3, 0, 0, 2, HYPER_THREADING_PROC, 2, -1},
-        {4, 0, 0, 2, MAIN_CORE_PROC, 2, -1},        {5, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
-        {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1},  {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1},
-        {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1},  {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1},
-        {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1},
-        {12, 0, 0, 6, EFFICIENT_CORE_PROC, 6, -1},  {13, 0, 0, 7, EFFICIENT_CORE_PROC, 6, -1},
-        {14, 0, 0, 8, EFFICIENT_CORE_PROC, 6, -1},  {15, 0, 0, 9, EFFICIENT_CORE_PROC, 6, -1},
-        {16, 0, 0, 10, EFFICIENT_CORE_PROC, 7, -1}, {17, 0, 0, 11, EFFICIENT_CORE_PROC, 7, -1},
-        {18, 0, 0, 12, EFFICIENT_CORE_PROC, 7, -1}, {19, 0, 0, 13, EFFICIENT_CORE_PROC, 7, -1},
+        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
+        {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1},
+        {2, 0, 0, 1, MAIN_CORE_PROC, 1, -1},
+        {3, 0, 0, 2, HYPER_THREADING_PROC, 2, -1},
+        {4, 0, 0, 2, MAIN_CORE_PROC, 2, -1},
+        {5, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
+        {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1},
+        {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1},
+        {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1},
+        {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1},
+        {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1},
+        {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1},
+        {12, 0, 0, 6, EFFICIENT_CORE_PROC, 6, -1},
+        {13, 0, 0, 7, EFFICIENT_CORE_PROC, 6, -1},
+        {14, 0, 0, 8, EFFICIENT_CORE_PROC, 6, -1},
+        {15, 0, 0, 9, EFFICIENT_CORE_PROC, 6, -1},
+        {16, 0, 0, 10, EFFICIENT_CORE_PROC, 7, -1},
+        {17, 0, 0, 11, EFFICIENT_CORE_PROC, 7, -1},
+        {18, 0, 0, 12, EFFICIENT_CORE_PROC, 7, -1},
+        {19, 0, 0, 13, EFFICIENT_CORE_PROC, 7, -1},
+        {20, 0, 0, 14, LP_EFFICIENT_CORE_PROC, -100, -100},
+        {21, 0, 0, 14, LP_EFFICIENT_CORE_PROC, -100, -100},
     },
     {
         {"0,5", "0,5", "0-19"},   {"1-2", "1-2", "0-19"},   {"1-2", "1-2", "0-19"},   {"3-4", "3-4", "0-19"},
@@ -1230,6 +1259,7 @@ LinuxCpuMapTestCase cache_1sockets_16cores = {
     1,
     1,
     16,
+    0,
     {{16, 4, 8, 4, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1276,6 +1306,7 @@ LinuxCpuMapTestCase cache_1sockets_14cores_hyperthreading = {
     1,
     1,
     14,
+    0,
     {{20, 6, 8, 0, 6, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1305,6 +1336,7 @@ LinuxCpuMapTestCase cache_1sockets_14cores_hyperthreading_1 = {
     1,
     1,
     14,
+    0,
     {{20, 6, 8, 0, 6, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1334,6 +1366,7 @@ LinuxCpuMapTestCase cache_1sockets_14cores = {
     1,
     1,
     9,
+    0,
     {{9, 1, 8, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1369,6 +1402,7 @@ LinuxCpuMapTestCase cache_1sockets_14cores_2 = {
     1,
     1,
     8,
+    0,
     {{8, 0, 8, 0, 0, 0, 0}},
     {
         {6, 0, 0, 0, EFFICIENT_CORE_PROC, 0, -1},
@@ -1403,6 +1437,7 @@ LinuxCpuMapTestCase cache_1sockets_10cores_hyperthreading = {
     1,
     1,
     10,
+    0,
     {{12, 2, 8, 0, 2, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -1439,6 +1474,7 @@ LinuxCpuMapTestCase cache_1sockets_8cores_hyperthreading = {
     1,
     1,
     8,
+    0,
     {{12, 4, 4, 0, 4, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -1475,6 +1511,7 @@ LinuxCpuMapTestCase cache_1sockets_8cores = {
     1,
     1,
     8,
+    0,
     {{8, 4, 0, 4, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1505,6 +1542,7 @@ LinuxCpuMapTestCase cache_1sockets_8cores_1 = {
     1,
     1,
     8,
+    0,
     {{8, 8, 0, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1533,6 +1571,7 @@ LinuxCpuMapTestCase cache_1sockets_6cores_hyperthreading = {
     1,
     1,
     6,
+    0,
     {{12, 6, 0, 0, 6, 0, 0}},
     {
         {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -1569,6 +1608,7 @@ LinuxCpuMapTestCase cache_1sockets_4cores = {
     1,
     1,
     4,
+    0,
     {{4, 4, 0, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1589,6 +1629,7 @@ LinuxCpuMapTestCase cache_1sockets_4cores_1 = {
     1,
     1,
     4,
+    0,
     {{4, 0, 4, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, EFFICIENT_CORE_PROC, 0, -1},
@@ -1609,6 +1650,7 @@ LinuxCpuMapTestCase cache_1sockets_4cores_2 = {
     1,
     1,
     4,
+    0,
     {{4, 4, 0, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1631,6 +1673,7 @@ LinuxCpuMapTestCase cache_VM_cache_0 = {
     0,
     0,
     0,
+    0,
     {},
     {},
     {
@@ -1653,6 +1696,7 @@ LinuxCpuMapTestCase cache_mock_0 = {
     1,
     1,
     8,
+    0,
     {{8, 0, 8, 0, 0, 0, 0}},
     {
         {6, 0, 0, 0, EFFICIENT_CORE_PROC, 0, -1},
@@ -1687,6 +1731,7 @@ LinuxCpuMapTestCase cache_mock_1 = {
     1,
     1,
     2,
+    0,
     {{2, 2, 0, 0, 0, 0, 0}},
     {
         {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp
index 82188856c853d7..aa43ddc28b5676 100644
--- a/src/plugins/intel_cpu/src/compiled_model.cpp
+++ b/src/plugins/intel_cpu/src/compiled_model.cpp
@@ -15,6 +15,7 @@
 
 #include "async_infer_request.h"
 #include "config.h"
+#include "cpu_parallel.hpp"
 #include "graph.h"
 #include "graph_context.h"
 #include "infer_request.h"
@@ -198,10 +199,12 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const {
             std::lock_guard<std::mutex> lock{*m_mutex};
             auto isQuantizedFlag = (m_cfg.lpTransformsMode == Config::On) &&
                                    ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model);
+            auto cpuParallel = std::make_shared<CpuParallel>(m_cfg.tbbPartitioner);
             ctx = std::make_shared<GraphContext>(m_cfg,
                                                  m_socketWeights[socketId],
                                                  isQuantizedFlag,
                                                  streamsExecutor,
+                                                 cpuParallel,
                                                  m_sub_memory_manager);
         }
@@ -298,6 +301,7 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
                                                   RO_property(ov::log::level.name()),
                                                   RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
                                                   RO_property(ov::intel_cpu::enable_tensor_parallel.name()),
+                                                  RO_property(ov::intel_cpu::tbb_partitioner.name()),
                                                   RO_property(ov::hint::dynamic_quantization_group_size.name()),
                                                   RO_property(ov::hint::kv_cache_precision.name()),
                                                   RO_property(ov::key_cache_precision.name()),
@@ -380,6 +384,9 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
         const auto& enable_tensor_parallel = config.enableTensorParallel;
         return enable_tensor_parallel;
     }
+    if (name == ov::intel_cpu::tbb_partitioner) {
+        return config.tbbPartitioner;
+    }
     if (name == ov::hint::dynamic_quantization_group_size) {
         return static_cast<uint64_t>(config.fcDynamicQuantizationGroupSize);
     }
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index dce25d5f24f080..8de4159bff4773 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -192,6 +192,16 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                            ov::intel_cpu::sparse_weights_decompression_rate.name(),
                            ". Sparse rate must be in range [0.0f,1.0f]");
             fcSparseWeiDecompressionRate = val_f;
+        } else if (key == ov::intel_cpu::tbb_partitioner.name()) {
+            try {
+                tbbPartitioner = val.as<ov::intel_cpu::TbbPartitioner>();
+            } catch (ov::Exception&) {
+                OPENVINO_THROW("Wrong value ",
+                               val.as<std::string>(),
+                               " for property key ",
+                               ov::intel_cpu::tbb_partitioner.name(),
+                               ". Expected only ov::intel_cpu::TbbPartitioner::STATIC/AUTO");
+            }
         } else if (key == ov::hint::dynamic_quantization_group_size.name()) {
             try {
                 fcDynamicQuantizationGroupSizeSetExplicitly = true;
diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
index 68ea781a204c34..4b467c0dd0643f 100644
--- a/src/plugins/intel_cpu/src/config.h
+++ b/src/plugins/intel_cpu/src/config.h
@@ -16,6 +16,7 @@
 #include "openvino/core/any.hpp"
 #include "openvino/core/attribute_visitor.hpp"
 #include "openvino/core/type/element_type.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "openvino/runtime/threading/istreams_executor.hpp"
 #include "utils/debug_caps_config.h"
@@ -100,6 +101,7 @@ struct Config {
     bool changedCpuPinning = false;
     bool enableCpuReservation = false;
     ov::hint::SchedulingCoreType schedulingCoreType = ov::hint::SchedulingCoreType::ANY_CORE;
+    ov::intel_cpu::TbbPartitioner tbbPartitioner = ov::intel_cpu::TbbPartitioner::NONE;
     std::set<ov::hint::ModelDistributionPolicy> modelDistributionPolicy;
     bool enableTensorParallel = false;
     int streamsRankLevel = 1;
@@ -134,6 +136,8 @@ struct Config {
     std::map<std::string, std::string> _config;
 
     int modelPreferThreads = -1;
+    int modelPreferThreadsLatency = 0;
+    int modelPreferThreadsThroughput = 0;
     ModelType modelType = ModelType::Unknown;
     std::function cacheEncrypt;
     std::function cacheDecrypt;
diff --git a/src/plugins/intel_cpu/src/cpu_parallel.cpp b/src/plugins/intel_cpu/src/cpu_parallel.cpp
new file mode 100644
index 00000000000000..c63786d1eee57d
--- /dev/null
+++ b/src/plugins/intel_cpu/src/cpu_parallel.cpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "cpu_parallel.hpp"
+
+#include
+#include
+
+#include "openvino/runtime/intel_cpu/properties.hpp"
+#include "thread_pool_imp.hpp"
+
+namespace ov::intel_cpu {
+CpuParallel::CpuParallel(ov::intel_cpu::TbbPartitioner partitioner, size_t multiplier)
+    : m_partitioner(partitioner),
+      m_multiplier(multiplier) {
+    m_partitioner =
+        m_partitioner == ov::intel_cpu::TbbPartitioner::NONE ? ov::intel_cpu::TbbPartitioner::STATIC : m_partitioner;
+    m_thread_pool = std::make_shared<ThreadPool>(*this);
+}
+
+}  // namespace ov::intel_cpu
diff --git a/src/plugins/intel_cpu/src/cpu_parallel.hpp b/src/plugins/intel_cpu/src/cpu_parallel.hpp
new file mode 100644
index 00000000000000..d63434ff975204
--- /dev/null
+++ b/src/plugins/intel_cpu/src/cpu_parallel.hpp
@@ -0,0 +1,360 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+
+#include "openvino/core/parallel.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
+
+namespace ov::intel_cpu {
+class ThreadPool;
+
+class CpuParallel {
+public:
+    // Default multiplier for the number of virtual threads when the tbb partitioner is AUTO.
+    // This value was determined empirically.
+    static constexpr int default_multiplier = 32;
+
+    CpuParallel() = delete;
+    CpuParallel(CpuParallel&) = delete;
+    CpuParallel(ov::intel_cpu::TbbPartitioner partitioner = ov::intel_cpu::TbbPartitioner::STATIC,
+                size_t multiplier = default_multiplier);
+    ~CpuParallel() = default;
+
+    [[nodiscard]] ov::intel_cpu::TbbPartitioner get_partitioner() const {
+        return m_partitioner;
+    }
+    [[nodiscard]] size_t get_multiplier() const {
+        return m_multiplier;
+    }
+    [[nodiscard]] std::shared_ptr<ThreadPool> get_thread_pool() const {
+        return m_thread_pool;
+    }
+    [[nodiscard]] int get_num_threads() const {
+        int num = m_partitioner == ov::intel_cpu::TbbPartitioner::STATIC ? parallel_get_max_threads()
+                                                                         : parallel_get_max_threads() * m_multiplier;
+        return num;
+    }
+    void activate() const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        dnnl_threadpool_interop_set_max_concurrency(get_num_threads());
+#endif
+    }
+
+    template <typename T0, typename F>
+    void parallel_simple(const T0 D0, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        const auto nthr = D0;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            tbb::parallel_for(0, nthr, [&](int ithr) {
+                func(ithr, nthr);
+            });
+        } else {
+            tbb::parallel_for(
+                0,
+                nthr,
+                [&](int ithr) {
+                    func(ithr, nthr);
+                },
+                tbb::static_partitioner());
+        }
+#else
+        ov::parallel_for(D0, func);
+#endif
+    }
+
+    template <typename T0, typename R, typename F>
+    [[nodiscard]] R parallel_sum(const T0& D0, const R& input, const F& func) const {
+        return cpu_parallel_sum(D0, input, func);
+    }
+    template <typename T0, typename F>
+    void parallel_for(const T0& D0, const F& func) const {
+        cpu_parallel_for(D0, func);
+    }
+    template <typename T0, typename T1, typename F>
+    void parallel_for2d(const T0& D0, const T1& D1, const F& func) const {
+        cpu_parallel_for2d(D0, D1, func);
+    }
+    template <typename T0, typename T1, typename T2, typename F>
+    void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) const {
+        cpu_parallel_for3d(D0, D1, D2, func);
+    }
+    template <typename T0, typename T1, typename T2, typename T3, typename F>
+    void parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) const {
+        cpu_parallel_for4d(D0, D1, D2, D3, func);
+    }
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
+    void parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const T4& D4, const F& func) const {
+        cpu_parallel_for5d(D0, D1, D2, D3, D4, func);
+    }
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename F>
+    void parallel_for6d(const T0& D0,
+                        const T1& D1,
+                        const T2& D2,
+                        const T3& D3,
+                        const T4& D4,
+                        const T5& D5,
+                        const F& func) const {
+        cpu_parallel_for6d(D0, D1, D2, D3, D4, D5, func);
+    }
+
+private:
+    template <typename T0, typename R, typename F>
+    [[nodiscard]] R cpu_parallel_sum(const T0& D0, const R& input, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        R res_sum = 0;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            res_sum = _TBB_REDUCE_FUNC(
+                tbb::blocked_range<T0>(0, D0),
+                input,
+                [&](const tbb::blocked_range<T0>& r, R init) -> R {
+                    R sum = init;
+                    for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1) {
+                        sum += func(dim1);
+                    }
+                    return sum;
+                },
+                [](R x, R y) -> R {
+                    return x + y;
+                });
+        } else {
+            res_sum = _TBB_REDUCE_FUNC(
+                tbb::blocked_range<T0>(0, D0),
+                input,
+                [&](const tbb::blocked_range<T0>& r, R init) -> R {
+                    R sum = init;
+                    for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1) {
+                        sum += func(dim1);
+                    }
+                    return sum;
+                },
+                [](R x, R y) -> R {
+                    return x + y;
+                },
+                tbb::static_partitioner());
+        }
+        return res_sum;
+#else
+        return ov::parallel_sum(D0, input, func);
+#endif
+    }
+
+    template <typename T0, typename F>
+    void cpu_parallel_for(const T0& D0, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<size_t>(D0);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_1d(0, 1, D0, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_1d(ithr, virtual_threads, D0, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_1d(ithr, virtual_threads, D0, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for(D0, func);  // from core
+#endif
+    }
+
+    template <typename T0, typename T1, typename F>
+    void cpu_parallel_for2d(const T0& D0, const T1& D1, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<size_t>(D0 * D1);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_2d(0, 1, D0, D1, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_2d(ithr, virtual_threads, D0, D1, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_2d(ithr, virtual_threads, D0, D1, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for2d(D0, D1, func);
+#endif
+    }
+
+    template <typename T0, typename T1, typename T2, typename F>
+    void cpu_parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<size_t>(D0 * D1 * D2);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+
+    template <typename T0, typename T1, typename T2, typename F>
+    void cpu_parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<int>(D0 * D1 * D2);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_3d(0, 1, D0, D1, D2, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_3d(ithr, virtual_threads, D0, D1, D2, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_3d(ithr, virtual_threads, D0, D1, D2, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for3d(D0, D1, D2, func);
+#endif
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename F>
+    void cpu_parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<int>(D0 * D1 * D2 * D3);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_4d(0, 1, D0, D1, D2, D3, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_4d(ithr, virtual_threads, D0, D1, D2, D3, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_4d(ithr, virtual_threads, D0, D1, D2, D3, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for4d(D0, D1, D2, D3, func);
+#endif
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
+    void cpu_parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const T4& D4, const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<int>(D0 * D1 * D2 * D3 * D4);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_5d(0, 1, D0, D1, D2, D3, D4, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_5d(ithr, virtual_threads, D0, D1, D2, D3, D4, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_5d(ithr, virtual_threads, D0, D1, D2, D3, D4, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for5d(D0, D1, D2, D3, D4, func);
+#endif
+    }
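+
+    // Note: the for_1d .. for_6d helpers used above come from
+    // openvino/core/parallel.hpp; each virtual thread `ithr` is handed one
+    // contiguous slice of the flattened D0 * ... * Dn iteration space and
+    // calls `func` once per index inside that slice.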
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename F>
+    void cpu_parallel_for6d(const T0& D0,
+                            const T1& D1,
+                            const T2& D2,
+                            const T3& D3,
+                            const T4& D4,
+                            const T5& D5,
+                            const F& func) const {
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+        auto work_amount = static_cast<int>(D0 * D1 * D2 * D3 * D4 * D5);
+        const int nthr = parallel_get_max_threads();
+        int virtual_threads = nthr;
+        if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+            virtual_threads = 1 == nthr ? 1 : nthr * m_multiplier;
+        }
+        if (virtual_threads > work_amount) {
+            virtual_threads = work_amount;
+        }
+        if (virtual_threads == 1) {
+            for_6d(0, 1, D0, D1, D2, D3, D4, D5, func);
+        } else {
+            if (m_partitioner == ov::intel_cpu::TbbPartitioner::AUTO) {
+                tbb::parallel_for(0, virtual_threads, [&](int ithr) {
+                    for_6d(ithr, virtual_threads, D0, D1, D2, D3, D4, D5, func);
+                });
+            } else {
+                tbb::parallel_for(
+                    0,
+                    virtual_threads,
+                    [&](int ithr) {
+                        for_6d(ithr, virtual_threads, D0, D1, D2, D3, D4, D5, func);
+                    },
+                    tbb::static_partitioner());
+            }
+        }
+#else
+        ov::parallel_for6d(D0, D1, D2, D3, D4, D5, func);
+#endif
+    }
+
+    ov::intel_cpu::TbbPartitioner m_partitioner = ov::intel_cpu::TbbPartitioner::STATIC;
+    size_t m_multiplier = default_multiplier;
+    std::shared_ptr<ThreadPool> m_thread_pool = nullptr;
+};
+
+}  // namespace ov::intel_cpu
diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
index 6b14e38d2ae015..9825f4ce948f60 100644
--- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
+++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
@@ -19,6 +19,7 @@
 #include "openvino/core/any.hpp"
 #include "openvino/core/except.hpp"
 #include "openvino/core/model.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "openvino/runtime/system_conf.hpp"
@@ -34,7 +35,6 @@
 #include "openvino/op/fake_quantize.hpp"
 #include "openvino/runtime/threading/cpu_streams_info.hpp"
 #include "openvino/runtime/threading/istreams_executor.hpp"
-#include "transformations/utils.hpp"
 #include "transformations/utils/utils.hpp"
 #include "utils/general_utils.h"
@@ -46,7 +46,7 @@ constexpr int TP_CPU_LIMIT = 32;
 namespace ov::intel_cpu {
-void sort_table_by_numa_node_id(const int current_numa_node, std::vector<std::vector<int>>& proc_type_table) {
+void sort_table_by_numa_node_id(int current_numa_node, std::vector<std::vector<int>>& proc_type_table) {
     if (proc_type_table.size() > 1) {
         for (size_t i = 1; i < proc_type_table.size(); i++) {
             if (current_numa_node == proc_type_table[i][PROC_NUMA_NODE_ID]) {
@@ -608,14 +608,39 @@ int get_model_prefer_threads(const int num_streams,
                              const std::vector<std::vector<int>>& proc_type_table,
                              const std::shared_ptr<ov::Model>& model,
                              Config& config) {
+    bool int8_intensive = ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model);
+
+    auto default_prefer_threads_latency = [&]() {
+        bool llm_related = ov::op::util::is_large_language_model(*model);
+        const int int8_threshold = 4;  // ~relative efficiency of the VNNI-intensive code for Big vs Little cores;
+        const int fp32_threshold = 2;  // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores;
+        // By default the latency case uses (faster) Big cores only, depending on the compute ratio
+        // But on MTL detected by ov::get_number_of_blocked_cores(), use Big and Little cores together in Big
+        // cores only cases except LLM.
+        bool use_all_cores =
+            proc_type_table[0][MAIN_CORE_PROC] <= (proc_type_table[0][EFFICIENT_CORE_PROC] /
+                                                   (int8_intensive || llm_related ?
int8_threshold : fp32_threshold)); + bool use_big_and_little = !llm_related && (ov::get_number_of_blocked_cores() != 0); + + if (use_all_cores || use_big_and_little) { + config.modelPreferThreadsLatency = + proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]; + } else { + config.modelPreferThreadsLatency = proc_type_table[0][MAIN_CORE_PROC]; + } + return; + }; + const int sockets = get_num_sockets(); - auto model_prefer = 0; + if (-1 == config.modelPreferThreads) { + config.modelPreferThreads = 0; #if (defined(OPENVINO_ARCH_ARM64) && defined(__linux__)) - config.modelPreferThreads = 8; + config.modelPreferThreadsThroughput = 8; if (dnnl::impl::cpu::aarch64::mayiuse(dnnl::impl::cpu::aarch64::cpu_isa_t::sve_128)) { - config.modelPreferThreads = 16; + config.modelPreferThreadsThroughput = 16; } + default_prefer_threads_latency(); #else const auto isa = dnnl::get_effective_cpu_isa(); float isaSpecificThreshold = 1.0F; @@ -648,96 +673,180 @@ int get_model_prefer_threads(const int num_streams, config.inferencePrecision); # if (defined(OPENVINO_ARCH_ARM) && defined(__linux__)) - config.modelPreferThreads = 4; + config.modelPreferThreadsThroughput = 4; if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { if (networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) { - config.modelPreferThreads = 8; + config.modelPreferThreadsThroughput = 8; } } else if ((networkToleranceForLowCache.max_mem_tolerance < ov::MemBandwidthPressure::LIMITED) && ((networkToleranceForLowCache.ratio_mem_limited_deconvs > ov::MemBandwidthPressure::LIMITED) || (networkToleranceForLowCache.ratio_mem_limited_gemms > ov::MemBandwidthPressure::LIMITED))) { - config.modelPreferThreads = 8; + config.modelPreferThreadsThroughput = 8; + } + default_prefer_threads_latency(); + +# elif (defined(OPENVINO_ARCH_ARM) && defined(__APPLE__)) + if ((proc_type_table.size() == 1) && (proc_type_table[0][EFFICIENT_CORE_PROC] > 0)) { + config.modelPreferThreadsLatency = + proc_type_table[0][MAIN_CORE_PROC] > proc_type_table[0][EFFICIENT_CORE_PROC] + ? 
proc_type_table[0][MAIN_CORE_PROC] + : proc_type_table[0][ALL_PROC]; + } else { + default_prefer_threads_latency(); } -# elif ((defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)) - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) || (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) { // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams - config.modelPreferThreads = 4; + config.modelPreferThreadsThroughput = 4; } // otherwise (no recognized layers) falling back to the default value } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) { // network is below the ISA-specific threshold - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) { // network is below general threshold - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; } else if (networkToleranceForLowCache.ratio_mem_limited_deconvs > ov::MemBandwidthPressure::LIMITED && networkToleranceForLowCache.ratio_compute_convs < ov::MemBandwidthPressure::ALL) { - config.modelPreferThreads = 4; + config.modelPreferThreadsThroughput = 4; } else if (networkToleranceForLowCache.ratio_mem_limited_deconvs <= ov::MemBandwidthPressure::LIMITED && networkToleranceForLowCache.ratio_mem_limited_convs <= ov::MemBandwidthPressure::LIMITED && networkToleranceForLowCache.ratio_compute_convs > ov::MemBandwidthPressure::LIMITED) { - config.modelPreferThreads = 2; + config.modelPreferThreadsThroughput = 2; } # else - config.modelPreferThreads = 0; + if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && proc_type_table[0][MAIN_CORE_PROC] > 0) { + if ((proc_type_table[0][MAIN_CORE_PROC] < config.threads || config.threads == 0) && + (ov::get_number_of_blocked_cores() || proc_type_table[0][LP_EFFICIENT_CORE_PROC] > 0) && + proc_type_table[0][EFFICIENT_CORE_PROC] <= 2 * proc_type_table[0][MAIN_CORE_PROC]) { + if (ov::op::util::is_large_language_model(*model)) { + config.modelPreferThreadsLatency = proc_type_table[0][MAIN_CORE_PROC]; + } else { + config.modelPreferThreadsLatency = + proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]; + if (config.tbbPartitioner == TbbPartitioner::NONE) { + if (proc_type_table[0][LP_EFFICIENT_CORE_PROC] > 0 && int8_intensive && + networkToleranceForLowCache.total_convs > 0) { + bool main_core_case_1 = networkToleranceForLowCache.ratio_mem_limited_convs > 0.8F; + bool main_core_case_2 = networkToleranceForLowCache.ratio_mem_limited_convs == 0.0F && + networkToleranceForLowCache.ratio_compute_convs == 0.0F && + networkToleranceForLowCache.max_mem_tolerance >= 4.5F; + bool main_core_case_3 = + networkToleranceForLowCache.ratio_mem_limited_convs == 0.0F && + networkToleranceForLowCache.ratio_compute_convs > 0.0F && + networkToleranceForLowCache.ratio_compute_convs < 1.0F && + static_cast(networkToleranceForLowCache.total_light_convs) > + 0.9F * static_cast(networkToleranceForLowCache.total_convs); + bool main_core_case_4 = + networkToleranceForLowCache.ratio_mem_limited_convs > 0.0F && + networkToleranceForLowCache.ratio_compute_convs > 0.0F && + static_cast(networkToleranceForLowCache.total_light_convs) > + 0.46F * 
static_cast(networkToleranceForLowCache.total_convs); + if (main_core_case_1 || main_core_case_2 || main_core_case_3 || main_core_case_4) { + config.modelPreferThreadsLatency = proc_type_table[0][MAIN_CORE_PROC]; + config.tbbPartitioner = TbbPartitioner::STATIC; + } + } + if (config.tbbPartitioner == TbbPartitioner::NONE) { + bool static_case_1 = networkToleranceForLowCache.total_nodes == 0; + bool static_case_2 = networkToleranceForLowCache.total_convs > 0 && + static_cast(networkToleranceForLowCache.total_light_convs) > + 0.6F * static_cast(networkToleranceForLowCache.total_convs); + bool static_case_3 = false; + bool static_case_4 = false; + bool static_case_5 = false; + if (proc_type_table[0][LP_EFFICIENT_CORE_PROC] > 0) { + static_case_3 = + networkToleranceForLowCache.total_convs > 0 && + static_cast(networkToleranceForLowCache.total_light_convs) <= + 0.6F * static_cast(networkToleranceForLowCache.total_convs) && + networkToleranceForLowCache.ratio_compute_convs + + networkToleranceForLowCache.ratio_mem_limited_convs < + 0.9F && + networkToleranceForLowCache.ratio_mem_limited_convs < 0.2F && + networkToleranceForLowCache.ratio_mem_limited_gemms == 0.0F && + ((networkToleranceForLowCache.ratio_mem_limited_adds < 0.28F && + networkToleranceForLowCache.max_mem_tolerance >= 0.06F) || + networkToleranceForLowCache.ratio_compute_convs == 0 || + networkToleranceForLowCache.ratio_mem_limited_convs == 0); + static_case_4 = + networkToleranceForLowCache.total_convs == 0 && + (networkToleranceForLowCache.max_mem_tolerance > 2.5F || + static_cast(networkToleranceForLowCache.total_gemms) >= + 0.14F * static_cast(networkToleranceForLowCache.total_nodes)); + static_case_5 = + networkToleranceForLowCache.total_convs > 0 && + static_cast(networkToleranceForLowCache.total_light_convs) <= + 0.6F * static_cast(networkToleranceForLowCache.total_convs) && + networkToleranceForLowCache.ratio_compute_convs >= + 0.9F * networkToleranceForLowCache.ratio_mem_limited_convs && + networkToleranceForLowCache.ratio_compute_convs == 1.0F && + networkToleranceForLowCache.ratio_mem_limited_adds == 1.0F && + static_cast(networkToleranceForLowCache.total_heavy_convs) > + 0.1F * static_cast(networkToleranceForLowCache.total_nodes); + } else { + static_case_3 = + networkToleranceForLowCache.total_convs > 0 && + static_cast(networkToleranceForLowCache.total_light_convs) <= + 0.6F * static_cast(networkToleranceForLowCache.total_convs) && + networkToleranceForLowCache.ratio_compute_convs + + networkToleranceForLowCache.ratio_mem_limited_convs < + 0.9F && + networkToleranceForLowCache.ratio_mem_limited_convs < 0.2F && + networkToleranceForLowCache.ratio_mem_limited_gemms == 0.0F && + networkToleranceForLowCache.ratio_mem_limited_adds < 0.28F && + networkToleranceForLowCache.max_mem_tolerance >= 0.06F; + static_case_4 = networkToleranceForLowCache.total_convs == 0 && + static_cast(networkToleranceForLowCache.total_gemms) < + 0.05F * static_cast(networkToleranceForLowCache.total_nodes); + } + if (static_case_1 || static_case_2 || static_case_3 || static_case_4 || static_case_5) { + config.tbbPartitioner = TbbPartitioner::STATIC; + } else { + config.tbbPartitioner = TbbPartitioner::AUTO; + } + } + } + } + } else { + default_prefer_threads_latency(); + } + } else { + config.modelPreferThreadsLatency = proc_type_table[0][MAIN_CORE_PROC] > 0 + ? 
proc_type_table[0][MAIN_CORE_PROC] + : proc_type_table[0][EFFICIENT_CORE_PROC]; + } + config.modelPreferThreadsThroughput = 0; if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { if (any_of(ov::MemBandwidthPressure::ALL, networkToleranceForLowCache.ratio_compute_convs, networkToleranceForLowCache.ratio_compute_deconvs)) { // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; } // otherwise (no recognized layers) falling back to the default value } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) { // network is below the ISA-specific threshold - config.modelPreferThreads = 1; + config.modelPreferThreadsThroughput = 1; } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) { // network is below general threshold - config.modelPreferThreads = 2; + config.modelPreferThreadsThroughput = 2; } - if (config.modelPreferThreads == 1 && proc_type_table[0][EFFICIENT_CORE_PROC] == 0 && + if (config.modelPreferThreadsThroughput == 1 && proc_type_table[0][EFFICIENT_CORE_PROC] == 0 && (proc_type_table[0][HYPER_THREADING_PROC] == proc_type_table[0][MAIN_CORE_PROC])) { - config.modelPreferThreads = 2; + config.modelPreferThreadsThroughput = 2; } # endif #endif } - // latency - if (num_streams <= sockets && num_streams > 0) { - if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && proc_type_table[0][MAIN_CORE_PROC] > 0) { -#ifdef __APPLE__ - if ((proc_type_table.size() == 1) && (proc_type_table[0][EFFICIENT_CORE_PROC] > 0)) { - model_prefer = proc_type_table[0][MAIN_CORE_PROC] > proc_type_table[0][EFFICIENT_CORE_PROC] - ? proc_type_table[0][MAIN_CORE_PROC] - : proc_type_table[0][ALL_PROC]; - } -#else - bool llm_related = has_matmul_with_compressed_weights(model); - bool int8_intensive = ov::op::util::has_op_with_type(model) || llm_related; - const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; - const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; - // By default the latency case uses (faster) Big cores only, depending on the compute ratio - // But on MTL detected by ov::get_number_of_blocked_cores(), use Big and Little cores together in Big - // cores only cases except LLM. - bool use_all_cores = - proc_type_table[0][MAIN_CORE_PROC] <= - (proc_type_table[0][EFFICIENT_CORE_PROC] / (int8_intensive ? int8_threshold : fp32_threshold)); - bool use_big_and_little = !llm_related && (ov::get_number_of_blocked_cores() != 0); - - if (use_all_cores || use_big_and_little) { - model_prefer = proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]; - } else { - model_prefer = proc_type_table[0][MAIN_CORE_PROC]; - } -#endif - } - } else { // throughput - model_prefer = config.modelPreferThreads; + if (num_streams > sockets || num_streams == 0) { + config.modelPreferThreads = config.modelPreferThreadsThroughput; + } else { + config.modelPreferThreads = config.modelPreferThreadsLatency; } - return model_prefer; + return config.modelPreferThreads; } std::vector> generate_stream_info(const int streams, @@ -760,7 +869,7 @@ std::vector> generate_stream_info(const int streams, } if (proc_type_table.size() > 1) { - const auto cur_numa_node_id = input_numa_node_id < 0 ? get_current_numa_node_id() : input_numa_node_id; + int cur_numa_node_id = input_numa_node_id < 0 ? 
get_current_numa_node_id() : input_numa_node_id; sort_table_by_numa_node_id(cur_numa_node_id, proc_type_table); } OPENVINO_ASSERT(!proc_type_table.empty() && proc_type_table[0][ALL_PROC] != 0, @@ -774,6 +883,8 @@ std::vector> generate_stream_info(const int streams, ov::util::to_string(config.hintPerfMode), config.modelDistributionPolicy, proc_type_table); + config.tbbPartitioner = + config.tbbPartitioner == TbbPartitioner::NONE ? TbbPartitioner::STATIC : config.tbbPartitioner; OPENVINO_ASSERT(!streams_info_table.empty(), "streams_info_table is empty!"); if (config.modelDistributionPolicy.find(ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL) != config.modelDistributionPolicy.end()) { diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp index 5c8a890eba740f..01f713a899fa25 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp @@ -117,5 +117,4 @@ void get_num_streams(int streams, const std::shared_ptr& model, Confi * @param[in] proc_type_table summary table of number of processors per type */ void sort_table_by_numa_node_id(int current_numa_node, std::vector>& proc_type_table); - } // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 8620ced8a5ae92..167a14d65decb2 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -69,17 +69,19 @@ #include "openvino/runtime/so_ptr.hpp" #include "perf_count.h" #include "proxy_mem_blk.h" +#include "thread_pool_imp.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" #include "utils/node_dumper.h" #include "utils/verbose.h" #include "weights_cache.hpp" -#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_OMP) +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE || \ + OV_THREAD == OV_THREAD_OMP) # include #endif -#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) +#if OV_THREAD_USE_TBB # include #endif @@ -121,7 +123,8 @@ void Graph::Init(const std::vector& graphNodes, } m_context = context; - m_stream = dnnl::stream(getEngine()); + m_stream = make_stream(getEngine(), m_context->getCpuParallel()->get_thread_pool()); + m_context->getCpuParallel()->activate(); this->_name = std::move(name); @@ -377,7 +380,8 @@ void Graph::Init(const std::shared_ptr& model, } m_context = context; - m_stream = dnnl::stream(getEngine()); + m_stream = make_stream(getEngine(), m_context->getCpuParallel()->get_thread_pool()); + m_context->getCpuParallel()->activate(); Replicate(model, inputConfigs, outputConfigs); @@ -1385,7 +1389,8 @@ class UpdateNodesSeq { using UpdateNodes = UpdateNodesSeq; #endif -#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_OMP) +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_TBB_ADAPTIVE || \ + OV_THREAD == OV_THREAD_OMP) class UpdateNodesBase { public: @@ -1432,7 +1437,7 @@ class UpdateNodesBase { }; // NOLINTBEGIN(misc-include-cleaner) tbb has multiple implicit includes, which are not supposed to be included directly -# if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) +# if OV_THREAD_USE_TBB # if (TBB_VERSION_MAJOR > 2020) template class AsyncTask : public tbb::detail::d1::task { diff --git a/src/plugins/intel_cpu/src/graph_context.cpp 
b/src/plugins/intel_cpu/src/graph_context.cpp
index 458f0dddf418be..60a74e816fcc5e 100644
--- a/src/plugins/intel_cpu/src/graph_context.cpp
+++ b/src/plugins/intel_cpu/src/graph_context.cpp
@@ -10,6 +10,7 @@
 #include "cache/multi_cache.h"
 #include "config.h"
+#include "cpu_parallel.hpp"
 #include "dnnl_scratch_pad.h"
 #include "memory_control.hpp"
 #include "nodes/memory.hpp"
@@ -25,6 +26,7 @@ GraphContext::GraphContext(Config config,
                            WeightsSharing::Ptr w_cache,
                            bool isGraphQuantized,
                            ov::threading::IStreamsExecutor::Ptr streamExecutor,
+                           std::shared_ptr<CpuParallel> cpuParallel,
                            std::shared_ptr<SubMemoryManager> sub_memory_manager)
     : m_config(std::move(config)),
       m_weightsCache(std::move(w_cache)),
@@ -32,6 +34,7 @@ GraphContext::GraphContext(Config config,
       m_snippetsParamsCache(std::make_shared<MultiCache>(m_config.snippetsCacheCapacity)),
       m_isGraphQuantizedFlag(isGraphQuantized),
       m_streamExecutor(std::move(streamExecutor)),
+      m_cpuParallel(std::move(cpuParallel)),
       m_subMemoryManager(std::move(sub_memory_manager)),
       m_memoryStatesRegister(std::make_shared<node::MemoryStatesRegister>()),
@@ -51,6 +54,10 @@ GraphContext::GraphContext(Config config,
     for (int i = 0; i < numaNum; i++) {
         m_rtScratchPads.push_back(std::make_shared<DnnlScratchPad>(getEngine(), i));
     }
+
+    if (!m_cpuParallel) {
+        m_cpuParallel = std::make_shared<CpuParallel>(m_config.tbbPartitioner);
+    }
 }
 const dnnl::engine& GraphContext::getEngine() {
diff --git a/src/plugins/intel_cpu/src/graph_context.h b/src/plugins/intel_cpu/src/graph_context.h
index 254762ca1c9ea7..656e25f810ea3c 100644
--- a/src/plugins/intel_cpu/src/graph_context.h
+++ b/src/plugins/intel_cpu/src/graph_context.h
@@ -10,6 +10,7 @@
 #include "cache/multi_cache.h"
 #include "config.h"
+#include "cpu_parallel.hpp"
 #include "dnnl_scratch_pad.h"
 #include "memory_control.hpp"
 #include "openvino/runtime/threading/cpu_streams_executor.hpp"
@@ -35,6 +36,7 @@ class GraphContext {
                  WeightsSharing::Ptr w_cache,
                  bool isGraphQuantized,
                  ov::threading::IStreamsExecutor::Ptr streamExecutor = nullptr,
+                 std::shared_ptr<CpuParallel> cpuParallel = nullptr,
                  std::shared_ptr<SubMemoryManager> sub_memory_manager = nullptr);
     [[nodiscard]] const Config& getConfig() const {
@@ -71,6 +73,10 @@ class GraphContext {
         return m_cpuStreamExecutor;
     }
+    [[nodiscard]] std::shared_ptr<CpuParallel> getCpuParallel() const {
+        return m_cpuParallel;
+    }
+
     [[nodiscard]] std::shared_ptr<SubMemoryManager> getSubMemory() const {
         return m_subMemoryManager;
     }
@@ -121,6 +127,7 @@ class GraphContext {
     ov::threading::IStreamsExecutor::Ptr m_streamExecutor;  // cpu stream executor for current graph
     ov::threading::CPUStreamsExecutor::Ptr m_cpuStreamExecutor;
+    std::shared_ptr<CpuParallel> m_cpuParallel = nullptr;
     // numa submemory manager
     std::shared_ptr<SubMemoryManager> m_subMemoryManager;
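Every node-level change that follows repeats one pattern: fetch the graph's CpuParallel from the GraphContext introduced above and route loops through it instead of the free ov::parallel_* helpers. A minimal sketch of that pattern (the dimensions B, C and the kernel callable are hypothetical):

    const auto& cpu_parallel = context->getCpuParallel();
    cpu_parallel->parallel_for2d(B, C, [&](size_t b, size_t c) {
        kernel(b, c);  // per-(b, c) work item
    });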
diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp
index 680444a7d74554..86a71ff4b6a941 100644
--- a/src/plugins/intel_cpu/src/node.cpp
+++ b/src/plugins/intel_cpu/src/node.cpp
@@ -811,6 +811,7 @@ void Node::updateDynamicParams() {
                     getName(),
                     " ",
                     getOriginalLayers());
+        context->getCpuParallel()->activate();
         prepareParams();
     }
 }
@@ -1116,7 +1117,10 @@ void Node::prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx) {
         Memory memory{engine, newDesc, internalBlob->getData()};
         MemoryPtr _ptr = std::make_shared<Memory>(engine, intDesc);
-        node::Reorder::reorderData(memory, *_ptr, context->getParamsCache());
+        node::Reorder::reorderData(memory,
+                                   *_ptr,
+                                   context->getParamsCache(),
+                                   context->getCpuParallel()->get_thread_pool());
         return _ptr;
     };
@@ -1150,7 +1154,10 @@ MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc, DnnlMemoryD
     auto create = [&]() {
         Memory srcMemory{getEngine(), srcWeightDesc, edgeMem->getData()};
         MemoryPtr _ptr = std::make_shared<Memory>(getEngine(), dstWeightDesc);
-        node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache());
+        node::Reorder::reorderData(srcMemory,
+                                   *_ptr,
+                                   context->getParamsCache(),
+                                   context->getCpuParallel()->get_thread_pool());
         return _ptr;
     };
diff --git a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp
index 0d30084da1586a..a8699c1ab3138e 100644
--- a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp
+++ b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp
@@ -25,7 +25,6 @@
 #include "onednn/iml_type_mapper.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/op/adaptive_avg_pool.hpp"
@@ -139,6 +138,7 @@ void AdaptivePooling::executeDynamicImpl(const dnnl::stream& strm) {
 }
 void AdaptivePooling::execute([[maybe_unused]] const dnnl::stream& strm) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto inputPrec = getParentEdgeAt(0)->getMemory().getDataType();
     auto outputPrec = getChildEdgeAt(0)->getMemory().getDataType();
     CPU_NODE_ASSERT(inputPrec == dnnl_f32 && outputPrec == dnnl_f32, "doesn't support demanded precisions");
@@ -264,7 +264,7 @@ void AdaptivePooling::execute([[maybe_unused]] const dnnl::stream& strm) {
         pool = poolAvg;
     }
-    parallel_for5d(N, blockCount, OD, OH, OW, [&](int n, int blkIdx, int od, int oh, int ow) {
+    cpu_parallel->parallel_for5d(N, blockCount, OD, OH, OW, [&](int n, int blkIdx, int od, int oh, int ow) {
         const auto* srcData = src + n * inStrides[0] + blkIdx * inStrides[1];
         auto* dstData = dst + n * outStrides[0] + blkIdx * outStrides[1] + od * outStrides[2] + oh * outStrides[3] +
                         ow * outStrides[4];
diff --git a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp
index 84f20afce5b105..100724d5aa86d5 100644
--- a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp
@@ -33,7 +33,6 @@
 #include "openvino/core/enum_names.hpp"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/op/binary_convolution.hpp"
@@ -1270,6 +1269,7 @@ void BinaryConvolution::executeOptimized(const uint8_t* src,
                                          const std::vector<size_t>& s_str,
                                          const std::vector<size_t>& w_str,
                                          const std::vector<size_t>& d_str) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto* dst_f32 = reinterpret_cast<float*>(dst);
     const int MB = jcp.mb;
@@ -1277,7 +1277,7 @@ void BinaryConvolution::executeOptimized(const uint8_t* src,
     int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking);
     int nbits = 8;
-    parallel_for4d(MB, jcp.ngroups, ocb_work, jcp.oh, [&](int n, int g, int ocbb, int oh) {
+    cpu_parallel->parallel_for4d(MB, jcp.ngroups, ocb_work, jcp.oh, [&](int n, int g, int ocbb, int oh) {
         int ocb = ocbb * jcp.nb_oc_blocking;
         int ocb_num = jcp.nb_oc_blocking;
@@ -1326,6 +1326,7 @@ void BinaryConvolution::executeReference(const uint8_t* src,
                                          const std::vector<size_t>& s_str,
                                          const std::vector<size_t>& w_str,
                                          const std::vector<size_t>& d_str) const {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto* dst_fp = reinterpret_cast<float*>(dst);
     const bool with_groups = jcp.ngroups > 1;
@@ -1393,7 +1394,7 @@ void BinaryConvolution::executeReference(const uint8_t* src,
         }
     };
-    parallel_for5d(G, MB, OC, OH, OW,
[&](int g, int mb, int oc, int oh, int ow) { + cpu_parallel->parallel_for5d(G, MB, OC, OH, OW, [&](int g, int mb, int oc, int oh, int ow) { int32_t a = 0; ker(a, g, mb, oc, oh, ow); diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.cpp b/src/plugins/intel_cpu/src/nodes/bucketize.cpp index 77e5b68809c732..1b7acd45d72ade 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.cpp +++ b/src/plugins/intel_cpu/src/nodes/bucketize.cpp @@ -22,7 +22,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/core/type/element_type_traits.hpp" @@ -232,6 +231,7 @@ void Bucketize::bucketize() { const auto* input_data = getSrcDataAtPortAs(0); const auto* boundaries_data = getSrcDataAtPortAs(1); auto* output_data = getDstDataAtPortAs(0); + const auto& cpu_parallel = context->getCpuParallel(); if (!with_bins) { memset(output_data, 0, num_values * sizeof(T_IND)); @@ -239,7 +239,7 @@ void Bucketize::bucketize() { } // boundaries are assumed to be sorted and to have unique elements - parallel_for(num_values, [&](size_t ind) { + cpu_parallel->parallel_for(num_values, [&](size_t ind) { T value = input_data[ind]; if (with_right) { const auto* low = std::lower_bound(boundaries_data, boundaries_data + num_bin_values, value); diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 4c2b6320a01bc9..15826f3d7a2ab7 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -605,6 +605,7 @@ void Concat::exec1DCase() { } void Concat::execNspcSpecCase() { + const auto& cpu_parallel = context->getCpuParallel(); const auto& dst_memory = getChildEdgeAt(0)->getMemory(); const size_t num_src = getParentEdges().size(); auto* dst_ptr = dst_memory.getDataAs(); @@ -638,7 +639,7 @@ void Concat::execNspcSpecCase() { const Shape& shape = getSrcMemoryAtPort(firstNonZeroEdge)->getShape(); const size_t iter_count = shape.getElementsCount() / shape.getStaticDims()[channelAxis]; - parallel_for(iter_count, [&](int i) { + cpu_parallel->parallel_for(iter_count, [&](int i) { const size_t dst_off = i * channels_size; for (size_t j = 0; j < nonZeroInShapes; j++) { cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channelsDataSize[j], channelsDataSize[j]); @@ -647,6 +648,7 @@ void Concat::execNspcSpecCase() { } void Concat::execRef() { + const auto& cpu_parallel = context->getCpuParallel(); const size_t numSrc = getParentEdges().size(); const auto& dstMemory = getChildEdgeAt(0)->getMemory(); auto* dstPtr = dstMemory.getDataAs(); @@ -695,65 +697,65 @@ void Concat::execRef() { } const auto L1Size = dnnl::utils::get_cache_size(1, true); UNUSED(L1Size); // for Windows - parallel_for6d(physDims[0], - physDims[1], - physDims[2], - physDims[3], - physDims[4], - numSrc, - [&](size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, size_t a) { - // check if zero memory - if (srcPtrs[a] == nullptr) { - return; - } - - size_t inOff = inputStrides[a][0] * n0 + inputStrides[a][1] * n1 + inputStrides[a][2] * n2 + - inputStrides[a][3] * n3 + inputStrides[a][4] * n4; - size_t outOff = outputStrides[0] * n0 + outputStrides[1] * n1 + outputStrides[2] * n2 + - outputStrides[3] * n3 + outputStrides[4] * n4; - const uint8_t* i = &srcPtrs[a][inOff]; - uint8_t* o = &dstPtr[dstOffset[a] + outOff]; + cpu_parallel->parallel_for6d( + physDims[0], + physDims[1], + physDims[2], + 
physDims[3], + physDims[4], + numSrc, + [&](size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, size_t a) { + // check if zero memory + if (srcPtrs[a] == nullptr) { + return; + } + + size_t inOff = inputStrides[a][0] * n0 + inputStrides[a][1] * n1 + inputStrides[a][2] * n2 + + inputStrides[a][3] * n3 + inputStrides[a][4] * n4; + size_t outOff = outputStrides[0] * n0 + outputStrides[1] * n1 + outputStrides[2] * n2 + + outputStrides[3] * n3 + outputStrides[4] * n4; + const uint8_t* i = &srcPtrs[a][inOff]; + uint8_t* o = &dstPtr[dstOffset[a] + outOff]; #if defined(__GNUC__) - // Heuristic: - // memcpy works generally faster for data sizes not - // exceeding L1 cache. - if (nelemToCopy[a] > L1Size) { - // The code below performs data copying: o[e] = i[e] - // and uses a workaround to make GNU compilers optimize it - uint8_t* ptro = o; - const uint8_t* ptri = i; - // head part: bytes before 4 byte-align's address - const size_t headPart = - sizeof(uint32_t) - reinterpret_cast(ptro) % sizeof(uint32_t); - - // main part: bytes in 4 byte-align - const size_t mainPart = (nelemToCopy[a] - headPart) / sizeof(uint32_t); - // tail part: bytes after 4 byte-align - const size_t tailPart = (nelemToCopy[a]) - headPart - (mainPart * sizeof(uint32_t)); - // copy head part - for (size_t e = 0; e < headPart; ++e) { - *ptro = *ptri; - ++ptro; - ++ptri; - } - // copy main part - std::memcpy(ptro, ptri, mainPart * sizeof(uint32_t)); - ptro += mainPart * sizeof(uint32_t); - ptri += mainPart * sizeof(uint32_t); - // copy tail part - for (size_t e = 0; e < tailPart; ++e) { - *ptro = *ptri; - ++ptro; - ++ptri; - } - } else { - std::memcpy(o, i, nelemToCopy[a]); - } + // Heuristic: + // memcpy works generally faster for data sizes not + // exceeding L1 cache. + if (nelemToCopy[a] > L1Size) { + // The code below performs data copying: o[e] = i[e] + // and uses a workaround to make GNU compilers optimize it + uint8_t* ptro = o; + const uint8_t* ptri = i; + // head part: bytes before 4 byte-align's address + const size_t headPart = sizeof(uint32_t) - reinterpret_cast(ptro) % sizeof(uint32_t); + + // main part: bytes in 4 byte-align + const size_t mainPart = (nelemToCopy[a] - headPart) / sizeof(uint32_t); + // tail part: bytes after 4 byte-align + const size_t tailPart = (nelemToCopy[a]) - headPart - (mainPart * sizeof(uint32_t)); + // copy head part + for (size_t e = 0; e < headPart; ++e) { + *ptro = *ptri; + ++ptro; + ++ptri; + } + // copy main part + std::memcpy(ptro, ptri, mainPart * sizeof(uint32_t)); + ptro += mainPart * sizeof(uint32_t); + ptri += mainPart * sizeof(uint32_t); + // copy tail part + for (size_t e = 0; e < tailPart; ++e) { + *ptro = *ptri; + ++ptro; + ++ptri; + } + } else { + std::memcpy(o, i, nelemToCopy[a]); + } #else - std::memcpy(o, i, nelemToCopy[a]); + std::memcpy(o, i, nelemToCopy[a]); #endif - }); + }); } } diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp index 44103f5e115a87..198ec287473104 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp @@ -90,6 +90,7 @@ void CTCGreedyDecoder::execute([[maybe_unused]] const dnnl::stream& strm) { const auto* probabilities = getSrcDataAtPortAs(DATA_INDEX); const auto* sequenceMask = getSrcDataAtPortAs(SEQUENCE_LENGTH_INDEX); auto* outputSequences = getDstDataAtPortAs(0); + const auto& cpu_parallel = context->getCpuParallel(); const size_t T = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[0]; 
const size_t B = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[1]; @@ -100,7 +101,7 @@ void CTCGreedyDecoder::execute([[maybe_unused]] const dnnl::stream& strm) { const int blankIndex = C - 1; std::vector sequenceLengths(B, 0); - parallel_for(B, [&](size_t b) { + cpu_parallel->parallel_for(B, [&](size_t b) { size_t t = 0; for (; t < T; t++) { if (sequenceMask[B * t + b] == 0.F) { @@ -168,7 +169,7 @@ void CTCGreedyDecoder::execute([[maybe_unused]] const dnnl::stream& strm) { parallel_nt(0, threadBody); - parallel_for(B, [&](size_t b) { + cpu_parallel->parallel_for(B, [&](size_t b) { float prevClassIdx = -1.0F; size_t outputIndex = b * T; const size_t sequenceLength = sequenceLengths[b]; diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp index 34b60583f45226..2085e20c5ac919 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp @@ -95,6 +95,7 @@ void CTCGreedyDecoderSeqLen::execute([[maybe_unused]] const dnnl::stream& strm) const auto* sequenceLengths = getSrcDataAtPortAs(SEQUENCE_LENGTH_INDEX); auto* decodedClasses = getDstDataAtPortAs(DECODED_CLASSES_INDEX); auto* decodedClassesLength = getDstDataAtPortAs(DECODED_CLASSES_LENGTH_INDEX); + const auto& cpu_parallel = context->getCpuParallel(); const size_t B = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[0]; ; @@ -169,7 +170,7 @@ void CTCGreedyDecoderSeqLen::execute([[maybe_unused]] const dnnl::stream& strm) parallel_nt(0, threadBody); - parallel_for(B, [&](size_t b) { + cpu_parallel->parallel_for(B, [&](size_t b) { int prevClassIdx = -1; size_t outputIndex = b * T; const size_t actualSeqLen = sequenceLengths[b]; diff --git a/src/plugins/intel_cpu/src/nodes/def_conv.cpp b/src/plugins/intel_cpu/src/nodes/def_conv.cpp index 1036d8b310d666..4757c0ea6c80bd 100644 --- a/src/plugins/intel_cpu/src/nodes/def_conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/def_conv.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -1051,7 +1050,7 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights(const float* } }; - parallel_nd(MB, DG, OH, OW, [&](dim_t mb, dim_t dg, dim_t oh, dim_t ow) { + parallel_for4d(MB, DG, OH, OW, [&](dim_t mb, dim_t dg, dim_t oh, dim_t ow) { precompKer(static_cast(mb), static_cast(dg), static_cast(oh), static_cast(ow)); }); } @@ -1133,7 +1132,7 @@ DeformableConvolution::DefConvExecutor::DefConvExecutor( jcp.ur_w = mayiuse(cpu::x64::avx512_core) ? 6 : 3; jcp.nb_oc_blocking = !mayiuse(cpu::x64::avx2) ? 
2 : 4; - jcp.nthr = dnnl_get_max_threads(); + jcp.nthr = parallel_get_max_threads(); } DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor( @@ -1230,7 +1229,7 @@ void DeformableConvolution::DefConvRefExecutor::exec(const float* src, return d; }; - parallel_nd(G, MB, OC, OH, OW, [&](dnnl_dim_t g, dnnl_dim_t mb, dnnl_dim_t oc, dnnl_dim_t oh, dnnl_dim_t ow) { + parallel_for5d(G, MB, OC, OH, OW, [&](dnnl_dim_t g, dnnl_dim_t mb, dnnl_dim_t oc, dnnl_dim_t oh, dnnl_dim_t ow) { dst[mb * dstStrides[0] + (g * OC + oc) * dstStrides[1] + oh * dstStrides[2] + ow * dstStrides[3]] = compKer(static_cast(g), static_cast(mb), diff --git a/src/plugins/intel_cpu/src/nodes/detection_output.cpp b/src/plugins/intel_cpu/src/nodes/detection_output.cpp index 8eedfe8ae2241a..a743df1cc8494f 100644 --- a/src/plugins/intel_cpu/src/nodes/detection_output.cpp +++ b/src/plugins/intel_cpu/src/nodes/detection_output.cpp @@ -25,7 +25,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "shape_inference/shape_inference_cpu.hpp" @@ -197,6 +196,7 @@ void DetectionOutput::execute([[maybe_unused]] const dnnl::stream& strm) { const auto* priorData = getSrcDataAtPortAs(ID_PRIOR); const float* ARMConfData = inputShapes.size() > 3 ? getSrcDataAtPortAs(ID_ARM_CONF) : nullptr; const float* ARMLocData = inputShapes.size() > 4 ? getSrcDataAtPortAs(ID_ARM_LOC) : nullptr; + const auto& cpu_parallel = context->getCpuParallel(); float* reorderedConfData = reorderedConf.data(); auto* reorderedConfDataIndices = reinterpret_cast(reorderedConf.data()); @@ -356,7 +356,7 @@ void DetectionOutput::execute([[maybe_unused]] const dnnl::stream& strm) { for (int n = 0; n < imgNum; ++n) { if (!decreaseClassId) { // Caffe style - parallel_for(classesNum, [&](int c) { + cpu_parallel->parallel_for(classesNum, [&](int c) { if (c != backgroundClassId) { // Ignore background class const int off = n * priorsNum * classesNum + c * priorsNum; const float* pconfReorder = reorderedConfData + off; @@ -401,7 +401,7 @@ void DetectionOutput::execute([[maybe_unused]] const dnnl::stream& strm) { } int detectionsTotal = 0; - detectionsTotal = parallel_sum(classesNum, detectionsTotal, [&](size_t c) -> int { + detectionsTotal = cpu_parallel->parallel_sum(classesNum, detectionsTotal, [&](size_t c) -> int { return detectionsData[n * classesNum + c]; }); @@ -410,7 +410,7 @@ void DetectionOutput::execute([[maybe_unused]] const dnnl::stream& strm) { std::vector>> confIndicesClassMap; std::mutex mtx; - parallel_for(classesNum, [&](int c) { + cpu_parallel->parallel_for(classesNum, [&](int c) { const int detections = detectionsData[n * classesNum + c]; int* pindices = indicesData + n * classesNum * priorsNum + c * priorsNum; @@ -478,7 +478,8 @@ inline void DetectionOutput::confFilterMX(const float* confData, int* detectionsData, const int& n) { std::mutex mtx; - parallel_for(numPriorsActual[n], [&](size_t p) { + const auto& cpu_parallel = context->getCpuParallel(); + cpu_parallel->parallel_for(numPriorsActual[n], [&](size_t p) { // in: origin conf // out: pindices, detectionCount // intentionally code branch from higher level @@ -553,8 +554,9 @@ inline void DetectionOutput::getActualPriorNum(const float* priorData, int* numP inline void DetectionOutput::confReorderDense(const float* confData, const float* ARMConfData, float* reorderedConfData) const { + const auto& cpu_parallel = 
context->getCpuParallel(); if (withAddBoxPred) { - parallel_for2d(imgNum, priorsNum, [&](size_t n, size_t p) { + cpu_parallel->parallel_for2d(imgNum, priorsNum, [&](size_t n, size_t p) { if (ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore) { for (int c = 0; c < classesNum; ++c) { reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = @@ -570,7 +572,7 @@ inline void DetectionOutput::confReorderDense(const float* confData, return; } // withAddBoxPred is false - parallel_for2d(imgNum, classesNum, [&](size_t n, size_t c) { + cpu_parallel->parallel_for2d(imgNum, classesNum, [&](size_t n, size_t c) { const int offset = n * priorsNum * classesNum; for (int p = 0; p < priorsNum; ++p) { reorderedConfData[offset + c * priorsNum + p] = confData[offset + p * classesNum + c]; @@ -584,6 +586,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat [[maybe_unused]] int* indicesData, int* indicesBufData, int* detectionsData) { + const auto& cpu_parallel = context->getCpuParallel(); auto* reorderedConfDataIndices = reinterpret_cast(reorderedConfData); for (int n = 0; n < imgNum; ++n) { const int off = n * priorsNum * classesNum; @@ -591,13 +594,13 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat const int offH = n * confInfoLen * classesNum; // horizontal info // reset count - parallel_for(classesNum, [&](size_t c) { + cpu_parallel->parallel_for(classesNum, [&](size_t c) { const int countIdx = offH + c * confInfoLen + priorsNum; reorderedConfDataIndices[countIdx] = 0; }); std::mutex mtx; - parallel_for(numPriorsActual[n], [&](size_t p) { + cpu_parallel->parallel_for(numPriorsActual[n], [&](size_t p) { // intentionally code branch from higher level if (withAddBoxPred) { const bool isARMPrior = ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore; @@ -649,7 +652,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat } }); // topk - parallel_for(classesNum, [&](size_t c) { + cpu_parallel->parallel_for(classesNum, [&](size_t c) { // in: conf_h info // out: buffer, detectionCount(k) if (c == static_cast(backgroundClassId)) { // Ignore background class @@ -675,12 +678,13 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat int* indicesData, int* indicesBufData, int* detectionsData) { + const auto& cpu_parallel = context->getCpuParallel(); for (int n = 0; n < imgNum; ++n) { const int off = n * priorsNum * classesNum; const int offV = n * priorsNum; // vertical info std::mutex mtx; - parallel_for(numPriorsActual[n], [&](size_t p) { + cpu_parallel->parallel_for(numPriorsActual[n], [&](size_t p) { bool isARMPrior = false; if (withAddBoxPred) { isARMPrior = ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore; @@ -751,13 +755,14 @@ inline void DetectionOutput::decodeBBoxes(const float* priorData, const int* confInfoH, const int* confInfoV) const { int prNum = numPriorsActual[n]; + const auto& cpu_parallel = context->getCpuParallel(); if (!decodeType) { prNum = priorsNum; } if (isSparsityWorthwhile && !isShareLoc && !decreaseClassId && confInfoH[priorsNum] == 0) { return; } - parallel_for(prNum, [&](int p) { + cpu_parallel->parallel_for(prNum, [&](int p) { if (isSparsityWorthwhile && isShareLoc && confInfoV[p] == -1) { return; } diff --git a/src/plugins/intel_cpu/src/nodes/dft.cpp b/src/plugins/intel_cpu/src/nodes/dft.cpp index 22cf03f072b072..f772277a49f771 100644 --- a/src/plugins/intel_cpu/src/nodes/dft.cpp +++ b/src/plugins/intel_cpu/src/nodes/dft.cpp @@ -322,6 +322,7 
@@ void DFT::dftNd(float* output, bool inverse) const { const std::vector iterationRange(outputShape.begin(), outputShape.end() - 1); const size_t lastDimIndex = iterationRange.size() - 1; + const auto& cpu_parallel = context->getCpuParallel(); for (size_t currentAxis : axes) { const size_t outputComplexLen = outputShape[currentAxis]; const size_t outputLen = outputComplexLen * 2; @@ -330,7 +331,7 @@ void DFT::dftNd(float* output, if (IsPowerOfTwo(outputComplexLen)) { size_t parallelDimIndex = lastDimIndex == currentAxis ? lastDimIndex - 1 : lastDimIndex; do { - parallel_for(iterationRange[parallelDimIndex], [&](size_t dim) { + cpu_parallel->parallel_for(iterationRange[parallelDimIndex], [&](size_t dim) { std::vector gatheredData(outputLen * 2); auto parallelIterationCounter = iterationCounter; parallelIterationCounter[parallelDimIndex] = dim; @@ -377,6 +378,7 @@ void DFT::fft(float* inBuffer, static int cacheSizeL3 = dnnl::utils::get_cache_size(3, false); static int elementsPerCacheLine = cacheSizeL3 / sizeof(float); size_t nComplex = dataLength / 2; + const auto& cpu_parallel = context->getCpuParallel(); std::function blockIteration; if (fftKernel != nullptr) { @@ -428,7 +430,7 @@ void DFT::fft(float* inBuffer, blockSize = nextIterationBlockSize; nextIterationBlockSize /= 2; if (parallelize && blockSize >= 4 * static_cast(elementsPerCacheLine)) { - parallel_for(numBlocks, [&](const size_t block) { + cpu_parallel->parallel_for(numBlocks, [&](const size_t block) { blockIteration(block, 1, nextIterationBlockSize); }); } else { @@ -455,6 +457,7 @@ void DFT::naiveDFT(float* data, size_t dataLength, bool inverse) const { CPU_NODE_THROW("Twiddles for nComplex=", nComplex, " not found"); } const auto& twiddles = twiddlesIter->second; + const auto& cpu_parallel = context->getCpuParallel(); std::function blockIteration; if (dftKernel != nullptr) { @@ -500,7 +503,7 @@ void DFT::naiveDFT(float* data, size_t dataLength, bool inverse) const { }; } - parallel_for(nComplex, blockIteration); + cpu_parallel->parallel_for(nComplex, blockIteration); cpu_memcpy(data, outputBuffer.data(), dataLength * sizeof(float)); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp index 246b5db1a7476d..b7184de8771ddf 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp @@ -49,6 +49,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" #include "post_ops.hpp" +#include "thread_pool_imp.hpp" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" @@ -151,7 +152,7 @@ std::optional acl_fc_executor::reorderDataFallback(const MemoryPtr& i auto convertOutput = *convertOutputOpt; if (reorderWithoutConvert) { - dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order); + dnnl::stream loc_stream = make_stream(output->getPrimitive().get_engine(), context->getThreadPool()); reorderWithoutConvert.execute( loc_stream, {{DNNL_ARG_FROM, convertOutput->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); @@ -198,7 +199,7 @@ MemoryPtr acl_fc_executor::reorderData(const DnnlMemoryDescPtr& srcWeightDesc, } // if precision conversion does not work then do direct reference reorder if (directReorder) { - dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order); + dnnl::stream loc_stream = make_stream(engine, 
context->getThreadPool()); directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); } else { diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp index 4628a6e7686faa..3fafe60c8f6c2d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp @@ -47,6 +47,7 @@ #include "openvino/core/type/element_type.hpp" #include "post_ops.hpp" #include "shape_inference/custom/convolution.hpp" +#include "thread_pool_imp.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" @@ -864,6 +865,7 @@ std::shared_ptr DnnlConvolutionPrimitive::create( auto builder = [&context, defaultImplType](const Key& dnnlKey) { return std::make_shared(dnnlKey, context->getEngine(), + context->getThreadPool(), context->getImplPriorities(), defaultImplType); }; @@ -1013,9 +1015,10 @@ bool DnnlConvolutionPrimitive::isNspcAvailable(const ConvConfig& config) { DnnlConvolutionPrimitive::DnnlConvolutionPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities, const impl_desc_type defaultImplType) - : m_stream(dnnl::stream(engine)), + : m_stream(make_stream(engine, threadPool)), m_primDesc(createPrimitiveDesc(key.src->getDnnlDesc(), key.wei->getDnnlDesc(), key.bias->getDnnlDesc(), diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.hpp index 2f80c0c8460994..3a002cdca6a6aa 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.hpp @@ -20,6 +20,7 @@ #include "nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/memory_arguments.hpp" #include "onednn/iml_type_mapper.h" +#include "thread_pool_imp.hpp" namespace ov::intel_cpu { @@ -65,6 +66,7 @@ class DnnlConvolutionPrimitive { public: DnnlConvolutionPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities, impl_desc_type defaultImplType); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index aa7680275bebe8..1d502efc6ec9a3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -37,6 +37,7 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" +#include "thread_pool_imp.hpp" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" @@ -107,7 +108,10 @@ std::shared_ptr DnnlFCPrimitive::create(const MemoryArgs& memor attrs.modelType}; auto builder = [&context](const Key& dnnlKey) { - return std::make_shared(dnnlKey, context->getEngine(), context->getImplPriorities()); + return std::make_shared(dnnlKey, + context->getEngine(), + context->getThreadPool(), + context->getImplPriorities()); }; auto runtimeCache = context->getRuntimeCache(); @@ -479,8 +483,9 @@ static impl_desc_type implTypeFromPrimDesc(const dnnl::primitive_desc& primDesc) 
DnnlFCPrimitive::DnnlFCPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities) - : m_stream(dnnl::stream(engine)), + : m_stream(make_stream(engine, threadPool)), m_primDesc(createPrimitiveDesc( key.src->getDnnlDesc(), key.wei->getDnnlDesc(), diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp index ed381da5559a22..f8638e8b2efa53 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp @@ -19,6 +19,7 @@ #include "nodes/executors/memory_arguments.hpp" #include "onednn/iml_type_mapper.h" #include "openvino/core/type/element_type.hpp" +#include "thread_pool_imp.hpp" namespace ov::intel_cpu { @@ -37,7 +38,10 @@ class DnnlFCPrimitive { }; public: - DnnlFCPrimitive(const Key& key, const dnnl::engine& engine, const std::vector& implPriorities); + DnnlFCPrimitive(const Key& key, + const dnnl::engine& engine, + const std::shared_ptr& threadPool, + const std::vector& implPriorities); void execute(const dnnl_primitive_args& primArgs) const; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp index e4695ad7e2d96b..57cd6599bf3378 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp @@ -37,6 +37,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" #include "post_ops.hpp" +#include "thread_pool_imp.hpp" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" @@ -137,6 +138,7 @@ std::shared_ptr DnnlMatMulPrimitive::create(const MemoryArg auto builder = [&context, defaultImplType](const Key& dnnlKey) { return std::make_shared(dnnlKey, context->getEngine(), + context->getThreadPool(), context->getImplPriorities(), defaultImplType); }; @@ -572,9 +574,10 @@ static impl_desc_type implTypeFromPrimDesc(const dnnl::primitive_desc& primDesc) DnnlMatMulPrimitive::DnnlMatMulPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities, const impl_desc_type defaultImplType) - : m_stream(dnnl::stream(engine)), + : m_stream(make_stream(engine, threadPool)), m_primDesc(createPrimitiveDesc(key.src->getDnnlDesc(), key.wei->getDnnlDesc(), key.bias->getDnnlDesc(), diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp index 826e13e8ee3082..29a5412d925459 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.hpp @@ -19,6 +19,7 @@ #include "nodes/executors/memory_arguments.hpp" #include "onednn/iml_type_mapper.h" #include "openvino/core/type/element_type.hpp" +#include "thread_pool_imp.hpp" namespace ov::intel_cpu { @@ -41,6 +42,7 @@ class DnnlMatMulPrimitive { public: DnnlMatMulPrimitive(const Key& key, const dnnl::engine& engine, + const std::shared_ptr& threadPool, const std::vector& implPriorities, impl_desc_type defaultImplType); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp 
b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp index 79bb26a4383fd0..7719c4270c1810 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp @@ -22,6 +22,7 @@ #include "nodes/reorder.h" #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" +#include "thread_pool_imp.hpp" #include "weights_cache.hpp" namespace ov::intel_cpu::utils { @@ -41,6 +42,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, context->getRuntimeCache(), context->getWeightsCache(), privateWeightCache, + context->getThreadPool(), needShiftSignedToUnsigned); } @@ -51,6 +53,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, const MultiCachePtr& rtCache, const WeightsSharing::Ptr& globalWeightCache, const std::shared_ptr>& privateWeightCache, + const std::shared_ptr& threadPool, bool needShiftSignedToUnsigned) { const auto format = dstWeightDesc->serializeFormat(); if (privateWeightCache) { @@ -71,7 +74,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, // prevent reorderData from doing conversion Memory srcMemory{eng, srcWeightDesc->cloneWithNewPrecision(dst_wdt), weightsMem->getData()}; MemoryPtr _ptr = std::make_shared(eng, dstWeightDesc); - node::Reorder::reorderData(srcMemory, *_ptr, rtCache); + node::Reorder::reorderData(srcMemory, *_ptr, rtCache, threadPool); // do shift auto count = _ptr->getSize() / _ptr->getDesc().getPrecision().size(); @@ -95,7 +98,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, Memory srcMemory{eng, srcWeightDesc, weightsMem->getData()}; MemoryPtr _ptr = std::make_shared(eng, dstWeightDesc); - node::Reorder::reorderData(srcMemory, *_ptr, rtCache); + node::Reorder::reorderData(srcMemory, *_ptr, rtCache, threadPool); return _ptr; }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.hpp index 1df2bdcbf2edfa..03edaa3e654a2f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.hpp @@ -31,5 +31,6 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr& srcWeightDesc, const MultiCachePtr& rtCache, const WeightsSharing::Ptr& globalWeightCache, const std::shared_ptr>& privateWeightCache, + const std::shared_ptr& threadPool, bool needShiftSignedToUnsigned = false); } // namespace ov::intel_cpu::utils diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp index cf22fe07f08442..c877f4d6b9ab13 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp @@ -61,7 +61,8 @@ class ExecutorContext { engine(graphContext->getEngine()), implPriorities(std::move(implPriorities)), privateWeighCache(std::move(privateWeighCache)), - numNumaNodes(graphContext->getNumNumaNodes()) { + numNumaNodes(graphContext->getNumNumaNodes()), + cpuParallel(graphContext->getCpuParallel()) { auto cpuStreamsExecutor = graphContext->getCPUStreamExecutor(); curNumaNodeId = std::max(0, cpuStreamsExecutor ? 
cpuStreamsExecutor->get_numa_node_id() : curNumaNodeId); } @@ -92,6 +93,10 @@ class ExecutorContext { return weightsCache; } + [[nodiscard]] std::shared_ptr getThreadPool() const { + return cpuParallel->get_thread_pool(); + } + private: // weak_ptr is required to avoid cycle dependencies with MultiCache // since ExecutorContext is stored in Executor itself @@ -104,6 +109,7 @@ class ExecutorContext { std::shared_ptr> privateWeighCache; int numNumaNodes; int curNumaNodeId = -1; + std::shared_ptr cpuParallel; }; class ExecutorFactoryLegacy { diff --git a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp index 562c9fe0c84734..6941fff88acf23 100644 --- a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp +++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -1700,6 +1699,7 @@ void FakeQuantize::createPrimitive() { } void FakeQuantize::executeReference() { + const auto& cpu_parallel = context->getCpuParallel(); auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1745,7 +1745,7 @@ void FakeQuantize::executeReference() { const auto* thresholds = internalBlobMemory[0]->getDataAs(); const auto* output_mask = internalBlobMemory[1]->getDataAs(); - parallel_nd(N, CB, D, H, W, [&](dim_t n, dim_t cb, dim_t d, dim_t h, dim_t w) { + cpu_parallel->parallel_for5d(N, CB, D, H, W, [&](dim_t n, dim_t cb, dim_t d, dim_t h, dim_t w) { uint8_t bin_val = 0x00; for (int c = static_cast(cb) * nbits, shift = 0; c < std::min(static_cast(C), (static_cast(cb) + 1) * nbits); @@ -1778,7 +1778,7 @@ void FakeQuantize::executeReference() { } else { auto* dst = dstMemory->getDataAs(); - parallel_nd(N, C, D, H, W, [&](dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { + cpu_parallel->parallel_for5d(N, C, D, H, W, [&](dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { size_t src_off = n * s_str[0]; if (srcDims.size() == 5) { src_off += c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4]; @@ -1826,6 +1826,7 @@ void FakeQuantize::executeReference() { } void FakeQuantize::executeBinarization(const std::unique_ptr& pKernel) const { #if defined(OPENVINO_ARCH_X86_64) + const auto& cpu_parallel = context->getCpuParallel(); auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1852,7 +1853,7 @@ void FakeQuantize::executeBinarization(const std::unique_ptrparallel_for3d(N, H, W, [&](dim_t n, dim_t h, dim_t w) { auto arg = jit_quantize_call_args(); arg.from = &src[(n * s_str[0] + h * s_str[2] + w * s_str[3]) * sizeof(float)]; @@ -1868,6 +1869,7 @@ void FakeQuantize::executeBinarization(const std::unique_ptr& pKernel) const { #if defined(OPENVINO_ARCH_X86_64) + const auto& cpu_parallel = context->getCpuParallel(); auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1924,7 +1926,7 @@ void FakeQuantize::executeQuantization(const std::unique_ptrparallel_for3d(N, CB, D, [&](dim_t n, dim_t cb, [[maybe_unused]] dim_t d) { auto arg = jit_quantize_call_args(); int c = static_cast(cb) * blk_size; @@ -1955,7 +1957,7 @@ void FakeQuantize::executeQuantization(const std::unique_ptr 2) { const int batch_size = 256; const int B = div_up(H * W, batch_size); - parallel_nd(N, CB, D, B, [&](dim_t n, dim_t cb, dim_t d, dim_t b) { + cpu_parallel->parallel_for4d(N, CB, D, B, [&](dim_t n, dim_t cb, dim_t d, dim_t b) { auto arg = jit_quantize_call_args(); const int c = static_cast(cb) * blk_size; @@ -1990,7 +1992,7 @@ void 
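From this point on the node changes follow one pattern: each node fetches the per-graph parallel facade once, `const auto& cpu_parallel = context->getCpuParallel();`, and every former free-function call (`parallel_for`, `parallel_nd`, `parallel_for2d`, ...) becomes a member call on that facade. A minimal sketch of the interface these call sites imply is shown here; the real class presumably also binds loops to the configured TBB partitioner, which this forwarding-only version omits (the class layout and method bodies are assumptions):

    // Hypothetical CpuParallel facade matching the call sites in this diff.
    // Real code would route these loops through the configured TBB arena/partitioner;
    // here they simply forward to the stock OpenVINO parallel helpers.
    #include <cstddef>

    #include "openvino/core/parallel.hpp"

    namespace ov::intel_cpu {

    class CpuParallel {
    public:
        template <typename F>
        void parallel_for(size_t n, const F& fn) const {
            ov::parallel_for(n, fn);
        }

        template <typename D0, typename D1, typename F>
        void parallel_for2d(D0 d0, D1 d1, const F& fn) const {
            ov::parallel_for2d(d0, d1, fn);
        }

        template <typename D0, typename D1, typename D2, typename F>
        void parallel_for3d(D0 d0, D1 d1, D2 d2, const F& fn) const {
            ov::parallel_for3d(d0, d1, d2, fn);
        }

        // parallel_for4d / parallel_for5d follow the same forwarding pattern.
    };

    }  // namespace ov::intel_cpu

The apparent design intent is to let the threading policy vary per compiled model rather than being fixed globally at build time.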
FakeQuantize::executeQuantization(const std::unique_ptrparallel_for4d(N, CB, D, H, [&](dim_t n, dim_t cb, dim_t d, dim_t h) { auto arg = jit_quantize_call_args(); int c = static_cast(cb) * blk_size; diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 4ac0de259b88fa..cee2e9be3a1db5 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -38,7 +38,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/threading/cpu_message.hpp" @@ -323,6 +322,7 @@ void FullyConnected::initTensorParallelSync() { void FullyConnected::execTensorParallelSync() { if (tp_cfg.enable_tensor_parallel) { + const auto& cpu_parallel = context->getCpuParallel(); // dst auto dst = getDstMemoryAtPort(0); auto* dst_ptr = static_cast(dst->getData()); @@ -364,7 +364,7 @@ void FullyConnected::execTensorParallelSync() { const auto copySize = splited_dim_vec[idx] * prec.size(); // bytes of half selected dim. const size_t unloop = 8; size_t step = count / unloop; - parallel_for(step, [&](size_t i) { + cpu_parallel->parallel_for(step, [&](size_t i) { cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop) * channel_size, new_ptr + (i * unloop) * copySize, copySize); diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index f99cc6a469efb9..808ec3198968a0 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -680,6 +680,7 @@ void Gather::initShortParams(threadExecParams& p, const uint64_t start) { template void Gather::execCompressed4Bit() { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcIndices = getSrcDataAtPortAs(GATHER_INDICES); const auto* srcData = getSrcDataAtPortAs(GATHER_DATA); auto* dstData = getDstDataAtPortAs(0); @@ -690,7 +691,7 @@ void Gather::execCompressed4Bit() { const auto* scale = getSrcDataAtPortAs(GATHER_SCALE); const size_t dstAfterBatchSize = betweenBatchAndAxisSize * specIdxAndAfterAxSize; - parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { + cpu_parallel->parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { int ii = srcIndices[b * specIndicesSize + j]; if (ii < 0) { if (reverseIndexing) { @@ -764,6 +765,7 @@ void Gather::execCompressed4Bit() { template void Gather::execCompressed8Bit() { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcIndices = getSrcDataAtPortAs(GATHER_INDICES); const auto* srcData = getSrcDataAtPortAs(GATHER_DATA); auto* dstData = getDstDataAtPortAs(0); @@ -775,7 +777,7 @@ void Gather::execCompressed8Bit() { const size_t dstAfterBatchSize = betweenBatchAndAxisSize * specIdxAndAfterAxSize; - parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { + cpu_parallel->parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { int ii = srcIndices[b * specIndicesSize + j]; if (ii < 0) { if (reverseIndexing) { @@ -925,12 +927,13 @@ void Gather::execCompressed() { } void Gather::execReference() { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcIndices = getSrcDataAtPortAs(GATHER_INDICES); const auto* srcData = getSrcDataAtPortAs(GATHER_DATA); auto* dstData = 
getDstDataAtPortAs(0); const size_t dstAfterBatchSize = betweenBatchAndAxisSize * specIdxAndAfterAxSizeBOut; - parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { + cpu_parallel->parallel_for2d(beforeBatchSize, specIndicesSize, [&](const size_t b, const size_t j) { int ii = srcIndices[b * specIndicesSize + j]; if (ii < 0) { if (reverseIndexing) { diff --git a/src/plugins/intel_cpu/src/nodes/grn.cpp b/src/plugins/intel_cpu/src/nodes/grn.cpp index 0c4d8a643fa7be..f75527bd91e57c 100644 --- a/src/plugins/intel_cpu/src/nodes/grn.cpp +++ b/src/plugins/intel_cpu/src/nodes/grn.cpp @@ -17,7 +17,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/grn.hpp" @@ -102,10 +101,11 @@ void GRN::executeDynamicImpl(const dnnl::stream& strm) { } void GRN::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const auto* src_data = getSrcDataAtPortAs(0); auto* dst_data = getDstDataAtPortAs(0); - parallel_for3d(N, H, W, [&](int b, int h, int w) { + cpu_parallel->parallel_for3d(N, H, W, [&](int b, int h, int w) { double variance = 0; for (int c = 0; c < C; c++) { variance += std::pow(src_data[b * C * H * W + c * H * W + h * W + w], 2); diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 0393c2390d2d3a..437a0c8f335c00 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -51,7 +51,6 @@ # include # include "cpu/x64/jit_generator.hpp" -# include "openvino/core/parallel.hpp" using namespace dnnl::impl::cpu::x64; using namespace Xbyak; @@ -452,6 +451,7 @@ void Input::cloneBlobIfRequired() { #if defined(OPENVINO_ARCH_X86_64) auto fn = jit_has_subnormals_function(); auto fn_bf16_check = jit_has_bf16_overflows_function(); + const auto& cpu_parallel = context->getCpuParallel(); if (fn && fn_bf16_check) { static const size_t batch_size = 2048; const size_t iterations_num = size / batch_size + 1; @@ -459,7 +459,7 @@ void Input::cloneBlobIfRequired() { std::atomic has_subnormals_local(false); std::atomic has_bf16_overflows_local(false); if (needFlushDenormalsToZero || do_bf16_saturation_check) { - parallel_for(iterations_num, [&](int n) { + cpu_parallel->parallel_for(iterations_num, [&](int n) { const auto* ptr = f32data + n * batch_size; jit_has_special_value_base::args_t args = { reinterpret_cast(ptr), diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp index b5db22802b5ac1..cb24258df6c509 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp @@ -2662,6 +2662,7 @@ std::vector Interpolate::getScales(const VectorDims& srcDimPad, const Vec } void Interpolate::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); auto dstMemPtr = getDstMemoryAtPort(0); auto srcMemPtr = getSrcMemoryAtPort(DATA_ID); @@ -2690,35 +2691,46 @@ void Interpolate::execute([[maybe_unused]] const dnnl::stream& strm) { if (interpAttrs.layout == InterpolateLayoutType::planar) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); auto* src_data_pad = static_cast(srcPadded.data()); - parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) { - const 
uint8_t* src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + - inShapeBlock[3] * d + inShapeBlock[4] * h) * - srcDataSize; - uint8_t* srcPad = - src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + - inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * - srcDataSize; - cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); - }); + cpu_parallel->parallel_for4d( + srcDim5d[0], + srcDim5d[1], + srcDim5d[2], + srcDim5d[3], + [&](int n, int c, int d, int h) { + const uint8_t* src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + + inShapeBlock[3] * d + inShapeBlock[4] * h) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + + inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * + srcDataSize; + cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); + }); src_data = src_data_pad; } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); auto* src_data_pad = static_cast(srcPadded.data()); - parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) { - const uint8_t* src = - src_data_origin + - (inShapeBlock[1] * n + - (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * - srcDataSize; - uint8_t* srcPad = - src_data_pad + (inShapePadBlock[1] * (n + padB0) + - (inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + - inShapePadBlock[5] * (w + padB4)) * - srcDimPad5d[1] + - padB1) * - srcDataSize; - cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); - }); + cpu_parallel->parallel_for4d( + srcDim5d[0], + srcDim5d[2], + srcDim5d[3], + srcDim5d[4], + [&](int n, int d, int h, int w) { + const uint8_t* src = + src_data_origin + + (inShapeBlock[1] * n + + (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + (inShapePadBlock[1] * (n + padB0) + + (inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + + inShapePadBlock[5] * (w + padB4)) * + srcDimPad5d[1] + + padB1) * + srcDataSize; + cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); + }); src_data = src_data_pad; } else if (interpAttrs.layout == InterpolateLayoutType::block) { size_t blkSize = mayiuse(cpu::x64::avx512_core) ? 
16 : 8; @@ -2728,28 +2740,28 @@ void Interpolate::execute([[maybe_unused]] const dnnl::stream& strm) { auto* src_data_pad = static_cast(srcPadded.data()); CPU_NODE_ASSERT((srcDim5d[0] == srcDimPad5d[0]) && (srcDim5d[1] == srcDimPad5d[1]), "does not support padding on batch and channel dimensions"); - parallel_for5d(srcDim5d[0], - CB, - srcDim5d[2], - srcDim5d[3], - srcDim5d[4], - [&](int n, int cb, int d, int h, int w) { - const uint8_t* src = - src_data_origin + - (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + - (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + - (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + - (h * srcDim5d[4] * blkSize) * srcDataSize + (w * blkSize) * srcDataSize; - uint8_t* srcPad = - src_data_pad + - (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * - srcDataSize + - (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + - ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + - ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize + - ((w + padB4) * blkSize) * srcDataSize; - cpu_memcpy(srcPad, src, blkSize * srcDataSize); - }); + cpu_parallel->parallel_for5d( + srcDim5d[0], + CB, + srcDim5d[2], + srcDim5d[3], + srcDim5d[4], + [&](int n, int cb, int d, int h, int w) { + const uint8_t* src = + src_data_origin + + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (h * srcDim5d[4] * blkSize) * srcDataSize + (w * blkSize) * srcDataSize; + uint8_t* srcPad = + src_data_pad + + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize + + ((w + padB4) * blkSize) * srcDataSize; + cpu_memcpy(srcPad, src, blkSize * srcDataSize); + }); src_data = src_data_pad; } } else { diff --git a/src/plugins/intel_cpu/src/nodes/inverse.cpp b/src/plugins/intel_cpu/src/nodes/inverse.cpp index 387e83fa6d188a..012dff080ba04a 100644 --- a/src/plugins/intel_cpu/src/nodes/inverse.cpp +++ b/src/plugins/intel_cpu/src/nodes/inverse.cpp @@ -21,7 +21,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/partial_shape.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" @@ -124,17 +123,18 @@ void Inverse::lu_decomposition(const float* data, size_t b) const { // Make L identity, U a copy of data and P a range(0, side) const auto batch_idx = b * m_side_squared; + const auto& cpu_parallel = context->getCpuParallel(); std::fill(L.begin(), L.end(), 0.0F); if (!m_adjoint) { cpu_parallel_memcpy(U.data(), &data[batch_idx], sizeof(float) * m_side_squared); } else { - parallel_for2d(m_side, m_side, [&](size_t i, size_t j) { + cpu_parallel->parallel_for2d(m_side, m_side, [&](size_t i, size_t j) { U[j * m_side + i] = data[batch_idx + i * m_side + j]; }); } - parallel_for(m_side, [&](size_t i) { + cpu_parallel->parallel_for(m_side, [&](size_t i) { L[i * m_side + i] = 1.0F; P[i] = i; }); @@ -156,7 +156,7 @@ void Inverse::lu_decomposition(const float* data, if (pivot_row != k) { // Swap rows in L, U and P std::swap(P[k], P[pivot_row]); - 
parallel_for(m_side, [&](size_t i) { + cpu_parallel->parallel_for(m_side, [&](size_t i) { std::swap(L[k_idx + i], L[pivot_idx + i]); std::swap(U[k_idx + i], U[pivot_idx + i]); }); @@ -165,12 +165,12 @@ void Inverse::lu_decomposition(const float* data, const auto remaining_columns = m_side - k; const auto remaining_rows = remaining_columns - 1; - parallel_for(remaining_rows, [&](size_t i) { + cpu_parallel->parallel_for(remaining_rows, [&](size_t i) { const auto i_idx = (i + k + 1) * m_side; L[i_idx + k] = U[i_idx + k] / U[k_idx + k]; }); - parallel_for(remaining_rows * remaining_columns, [&](size_t i) { + cpu_parallel->parallel_for(remaining_rows * remaining_columns, [&](size_t i) { const auto i_idx = (i / remaining_columns + k + 1) * m_side; const auto j_idx = i % remaining_columns + k; U[i_idx + j_idx] = U[i_idx + j_idx] - L[i_idx + k] * U[k_idx + j_idx]; @@ -183,7 +183,8 @@ void Inverse::lu_solve(float* output, std::vector& U, std::vector& P, size_t b) const { - parallel_for(m_side, [&](size_t column) { + const auto& cpu_parallel = context->getCpuParallel(); + cpu_parallel->parallel_for(m_side, [&](size_t column) { std::vector X(m_side, 0.0F); std::vector Y(m_side, 0.0F); diff --git a/src/plugins/intel_cpu/src/nodes/log_softmax.cpp b/src/plugins/intel_cpu/src/nodes/log_softmax.cpp index 779a9d247279b9..d459cd322a7cf0 100644 --- a/src/plugins/intel_cpu/src/nodes/log_softmax.cpp +++ b/src/plugins/intel_cpu/src/nodes/log_softmax.cpp @@ -19,7 +19,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/log_softmax.hpp" @@ -107,11 +106,12 @@ void LogSoftmax::executeDynamicImpl(const dnnl::stream& strm) { } void LogSoftmax::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcData = getSrcDataAtPortAs(0); auto* dstData = getDstDataAtPortAs(0); if (isLastDim) { - parallel_for(axisStep, [&](size_t i) { + cpu_parallel->parallel_for(axisStep, [&](size_t i) { const float* srcDataPtr = &srcData[i * reducedAxisSize]; float* dstDataPtr = &dstData[i * reducedAxisSize]; @@ -127,7 +127,7 @@ void LogSoftmax::execute([[maybe_unused]] const dnnl::stream& strm) { } }); } else { - parallel_for2d(axisStep, reducedAxisStride, [&](size_t k, size_t i) { + cpu_parallel->parallel_for2d(axisStep, reducedAxisStride, [&](size_t k, size_t i) { const float* srcDataPtr = &srcData[k * reducedAxisStride * reducedAxisSize + i]; float* dstDataPtr = &dstData[k * reducedAxisStride * reducedAxisSize + i]; diff --git a/src/plugins/intel_cpu/src/nodes/mathematics.cpp b/src/plugins/intel_cpu/src/nodes/mathematics.cpp index 39321ffeb44131..c4a468c3f404ea 100644 --- a/src/plugins/intel_cpu/src/nodes/mathematics.cpp +++ b/src/plugins/intel_cpu/src/nodes/mathematics.cpp @@ -22,7 +22,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/abs.hpp" @@ -104,90 +103,91 @@ void Math::execute([[maybe_unused]] const dnnl::stream& strm) { size_t dataSize = getChildEdgeAt(0)->getMemory().getShape().getElementsCount(); const auto* src_data = getSrcDataAtPortAs(0); auto* dst_data = getDstDataAtPortAs(0); + const auto& cpu_parallel = context->getCpuParallel(); switch (getAlgorithm()) { case 
Algorithm::MathAbs: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = (std::abs)(src_data[i]); }); break; case Algorithm::MathAcos: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = acosf(src_data[i]); }); break; case Algorithm::MathAcosh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = acoshf(src_data[i]); }); break; case Algorithm::MathAsin: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = asinf(src_data[i]); }); break; case Algorithm::MathAsinh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = asinhf(src_data[i]); }); break; case Algorithm::MathAtan: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = atanf(src_data[i]); }); break; case Algorithm::MathAtanh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = atanhf(src_data[i]); }); break; case Algorithm::MathCeiling: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = ceilf(src_data[i]); }); break; case Algorithm::MathCos: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = cosf(src_data[i]); }); break; case Algorithm::MathCosh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = coshf(src_data[i]); }); break; case Algorithm::MathFloor: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = floorf(src_data[i]); }); break; case Algorithm::MathHardSigmoid: alpha = (alpha == 0.0F) ? 0.2F : alpha; beta = (beta == 0.0F) ? 0.5F : beta; - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = (std::max)(0.F, (std::min)(1.F, alpha * src_data[i] + beta)); }); break; case Algorithm::MathNegative: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = -src_data[i]; }); break; case Algorithm::MathReciprocal: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = 1.0F / src_data[i]; }); break; case Algorithm::MathSelu: alpha = (alpha == 0.0F) ? 1.67326F : alpha; gamma = (gamma == 0.0F) ? 1.0507F : gamma; - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { float x = src_data[i]; dst_data[i] = (x > 0.0F) ? 
(gamma * x) : (gamma * alpha * (std::exp(x) - 1.0F)); }); break; case Algorithm::MathSign: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { if (src_data[i] > 0.0F) { dst_data[i] = 1.0F; } else if (src_data[i] < 0.0F) { @@ -200,28 +200,28 @@ void Math::execute([[maybe_unused]] const dnnl::stream& strm) { }); break; case Algorithm::MathSin: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = sinf(src_data[i]); }); break; case Algorithm::MathSinh: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = sinhf(src_data[i]); }); break; case Algorithm::MathSoftPlus: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = logf(expf(src_data[i]) + 1); }); break; case Algorithm::MathSoftsign: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { float x = src_data[i]; dst_data[i] = x / (1.F + (std::abs)(x)); }); break; case Algorithm::MathTan: - parallel_for(dataSize, [&](size_t i) { + cpu_parallel->parallel_for(dataSize, [&](size_t i) { dst_data[i] = tanf(src_data[i]); }); break; diff --git a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp index 77d34e0e9434cd..cbfbdf3165dca3 100644 --- a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp @@ -216,9 +216,10 @@ size_t MatrixNms::nmsMatrix(const float* boxesData, std::vector iouMatrix((originalSize * (originalSize - 1)) >> 1); std::vector iouMax(originalSize); + const auto& cpu_parallel = context->getCpuParallel(); iouMax[0] = 0.; - ov::parallel_for(originalSize - 1, [&](size_t i) { + cpu_parallel->parallel_for(originalSize - 1, [&](size_t i) { float max_iou = 0.; size_t actual_index = i + 1; auto idx_a = candidateIndex[actual_index]; @@ -336,10 +337,11 @@ void MatrixNms::executeDynamicImpl(const dnnl::stream& strm) { } void MatrixNms::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const auto* boxes = getSrcDataAtPortAs(NMS_BOXES); const auto* scores = getSrcDataAtPortAs(NMS_SCORES); - ov::parallel_for2d(m_numBatches, m_numClasses, [&](size_t batchIdx, size_t classIdx) { + cpu_parallel->parallel_for2d(m_numBatches, m_numClasses, [&](size_t batchIdx, size_t classIdx) { if (classIdx == static_cast(m_backgroundClass)) { m_numPerBatchClass[batchIdx][classIdx] = 0; return; @@ -356,7 +358,7 @@ void MatrixNms::execute([[maybe_unused]] const dnnl::stream& strm) { m_numPerBatchClass[batchIdx][classIdx] = classNumDet; }); - ov::parallel_for(m_numBatches, [&](size_t batchIdx) { + cpu_parallel->parallel_for(m_numBatches, [&](size_t batchIdx) { size_t batchOffset = batchIdx * m_realNumClasses * m_realNumBoxes; BoxInfo* batchFilteredBox = m_filteredBoxes.data() + batchOffset; auto& numPerClass = m_numPerBatchClass[batchIdx]; diff --git a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp index 3bce1b209a5baa..8cac1838e9bf19 100644 --- a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp @@ -490,6 +490,7 @@ void MultiClassNms::nmsWithEta(const float* boxes, const VectorDims& scoresStrides, const VectorDims& roisnumStrides, const bool shared) { + const auto& cpu_parallel = context->getCpuParallel(); auto less = [](const boxInfo& l, const boxInfo& r) { 
return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx)); }; @@ -498,7 +499,7 @@ void MultiClassNms::nmsWithEta(const float* boxes, return iou <= adaptive_threshold ? 1.0F : 0.0F; }; - parallel_for2d(m_numBatches, m_numClasses, [&](int batch_idx, int class_idx) { + cpu_parallel->parallel_for2d(m_numBatches, m_numClasses, [&](int batch_idx, int class_idx) { if (!shared) { if (roisnum[batch_idx] <= 0) { m_numFiltBox[batch_idx][class_idx] = 0; @@ -608,7 +609,7 @@ void MultiClassNms::nmsWithoutEta(const float* boxes, const VectorDims& scoresStrides, const VectorDims& roisnumStrides, const bool shared) { - parallel_for2d(m_numBatches, m_numClasses, [&](int batch_idx, int class_idx) { + const auto& cpu_parallel = context->getCpuParallel(); + cpu_parallel->parallel_for2d(m_numBatches, m_numClasses, [&](int batch_idx, int class_idx) { /* // nms over a class over an image // boxes: num_priors, 4 diff --git a/src/plugins/intel_cpu/src/nodes/multinomial.cpp b/src/plugins/intel_cpu/src/nodes/multinomial.cpp index 348428f85b6be0..c6add9472428c1 100644 --- a/src/plugins/intel_cpu/src/nodes/multinomial.cpp +++ b/src/plugins/intel_cpu/src/nodes/multinomial.cpp @@ -22,7 +22,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/core/type/float16.hpp" #include "openvino/op/multinomial.hpp" @@ -184,6 +183,7 @@ template void Multinomial::execute_convert_type() { const auto* probs = getSrcDataAtPortAs(PROBS_PORT); auto* output = getDstDataAtPortAs(OUTPUT_PORT); + const auto& cpu_parallel = context->getCpuParallel(); std::vector m_cdf(m_input_elements_count); std::vector m_max_per_batch(m_batches_count); @@ -191,7 +191,7 @@ void Multinomial::execute_convert_type() { // exp & cumsum if (m_log_probs) { - parallel_for(m_batches_count, [&](size_t idx) { + cpu_parallel->parallel_for(m_batches_count, [&](size_t idx) { const auto start_idx = idx * m_probs_count; m_cdf[start_idx] = std::exp(probs[start_idx]); for (size_t prev = start_idx, curr = prev + 1; curr < (start_idx + m_probs_count); ++prev, ++curr) { @@ -199,7 +199,7 @@ void Multinomial::execute_convert_type() { } }); } else { - parallel_for(m_batches_count, [&](size_t idx_batch) { + cpu_parallel->parallel_for(m_batches_count, [&](size_t idx_batch) { const auto start_idx = idx_batch * m_probs_count; const auto* probs_start_idx = probs + start_idx; std::partial_sum(probs_start_idx, probs_start_idx + m_probs_count, m_cdf.begin() + start_idx); @@ -222,17 +222,17 @@ void Multinomial::execute_convert_type() { // max & divide const auto min_value_of_max = std::numeric_limits::min(); - parallel_for(m_batches_count, [&](size_t idx) { + cpu_parallel->parallel_for(m_batches_count, [&](size_t idx) { m_max_per_batch[idx] = std::max(m_cdf[(idx + 1) * m_probs_count - 1], min_value_of_max); }); - parallel_for(m_input_elements_count, [&](size_t idx) { + cpu_parallel->parallel_for(m_input_elements_count, [&](size_t idx) { size_t idx_max_elem = idx / m_probs_count; m_cdf[idx] = m_cdf[idx] / m_max_per_batch[idx_max_elem]; }); if (m_with_replacement) { - parallel_for(m_batches_samples_probs_count, [&](size_t idx) { + cpu_parallel->parallel_for(m_batches_samples_probs_count, [&](size_t idx) { size_t idx_batch = idx / m_samples_probs_count; size_t idx_num_samples_probs = idx % m_samples_probs_count; size_t idx_prob = idx_num_samples_probs % m_probs_count; @@ -246,7 +246,7 @@ void Multinomial::execute_convert_type() { } }); } else { // without replacement - adjust cdf after each sample drawn from batch, sequentially - parallel_for(m_batches_count, [&](size_t idx_batch) { + cpu_parallel->parallel_for(m_batches_count, [&](size_t idx_batch) { for (size_t idx_sample = 0LU; idx_sample < m_samples_count; ++idx_sample) { size_t idx_input = idx_batch * m_probs_count; size_t idx_output = idx_batch * m_samples_count + idx_sample; diff --git a/src/plugins/intel_cpu/src/nodes/ngram.cpp b/src/plugins/intel_cpu/src/nodes/ngram.cpp index 892bad8c8c7889..1ecf5daeffff7b 100644 --- a/src/plugins/intel_cpu/src/nodes/ngram.cpp +++ b/src/plugins/intel_cpu/src/nodes/ngram.cpp @@ -23,7 +23,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "shape_inference/custom/ngram.hpp" @@ -118,6 +117,7 @@ std::vector Ngram::computeBatchLenghts() { void Ngram::execute([[maybe_unused]] const dnnl::stream& strm) { const auto* srcData = getSrcDataAtPortAs(0); auto* dstData = getDstDataAtPortAs(0); + const auto& cpu_parallel = context->getCpuParallel(); std::vector batchLenghts; if (idcesPrecision == ov::element::i32) { @@ -133,7 +133,7 @@ 2.
Apply sliding window of windowSize with a step windowStride and form k new embedding vectors for the embedding */ memset(dstData, 0, numOutElems * sizeof(float)); - parallel_for(batchLenghts.size() - 1, [&](const size_t batchIdx) { + cpu_parallel->parallel_for(batchLenghts.size() - 1, [&](const size_t batchIdx) { size_t srcWindowBias = 0; size_t dstWindowBias = 0; diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp index 9e4f4c39519ea7..d86158e755cf95 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp @@ -364,6 +364,7 @@ void NonMaxSuppression::nmsWithSoftSigma(const float* boxes, const VectorDims& boxesStrides, const VectorDims& scoresStrides, std::vector& filtBoxes) { + const auto& cpu_parallel = context->getCpuParallel(); auto less = [](const boxInfo& l, const boxInfo& r) { return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx)); }; @@ -378,7 +379,7 @@ void NonMaxSuppression::nmsWithSoftSigma(const float* boxes, return std::exp(m_scale * iou * iou); }; - parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { + cpu_parallel->parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { std::vector selectedBoxes; const float* boxesPtr = boxes + batch_idx * boxesStrides[0]; const float* scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; @@ -516,8 +517,9 @@ void NonMaxSuppression::nmsWithoutSoftSigma(const float* boxes, const VectorDims& boxesStrides, const VectorDims& scoresStrides, std::vector& filtBoxes) { + const auto& cpu_parallel = context->getCpuParallel(); auto max_out_box = static_cast(m_output_boxes_per_class); - parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { + cpu_parallel->parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { const float* boxesPtr = boxes + batch_idx * boxesStrides[0]; const float* scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; @@ -862,7 +864,8 @@ void NonMaxSuppression::nmsRotated(const float* boxes, const VectorDims& scores_strides, std::vector& filtered_boxes) { CPU_NODE_ASSERT(!m_jit_kernel, "does not have implementation of the JIT kernel for Rotated boxes."); - parallel_for2d(m_batches_num, m_classes_num, [&](int64_t batch_idx, int64_t class_idx) { + const auto& cpu_parallel = context->getCpuParallel(); + cpu_parallel->parallel_for2d(m_batches_num, m_classes_num, [&](int64_t batch_idx, int64_t class_idx) { const float* boxes_ptr = boxes + batch_idx * boxes_strides[0]; const float* scores_ptr = scores + batch_idx * scores_strides[0] + class_idx * scores_strides[1]; diff --git a/src/plugins/intel_cpu/src/nodes/one_hot.cpp b/src/plugins/intel_cpu/src/nodes/one_hot.cpp index 8a84ee3b53363a..253693539bf6ae 100644 --- a/src/plugins/intel_cpu/src/nodes/one_hot.cpp +++ b/src/plugins/intel_cpu/src/nodes/one_hot.cpp @@ -19,7 +19,6 @@ #include "openvino/cc/selective_build.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/shape.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" @@ -127,6 +126,7 @@ void OneHot::initSupportedPrimitiveDescriptors() { template void OneHot::one_hot(size_t prefix_size, size_t suffix_size) { + const auto& cpu_parallel = context->getCpuParallel(); const auto* src_data = 
getSrcDataAtPortAs(0); auto* dst_data = getDstDataAtPortAs(0); @@ -139,7 +139,7 @@ void OneHot::one_hot(size_t prefix_size, size_t suffix_size) { // set on_value at needed locations auto on_val = on_value; - parallel_for(prefix_size, [&](std::size_t prefix_idx) { + cpu_parallel->parallel_for(prefix_size, [&](std::size_t prefix_idx) { const in_type* src_dataPtr = &src_data[prefix_idx * suffix_size]; out_type* dst_dataPtr = &dst_data[prefix_idx * depth * suffix_size]; for (std::size_t suffix_idx = 0; suffix_idx < suffix_size; ++suffix_idx, ++src_dataPtr, ++dst_dataPtr) { diff --git a/src/plugins/intel_cpu/src/nodes/priorbox.cpp b/src/plugins/intel_cpu/src/nodes/priorbox.cpp index 37d4edfee3969f..2e2bfd6d202e0c 100644 --- a/src/plugins/intel_cpu/src/nodes/priorbox.cpp +++ b/src/plugins/intel_cpu/src/nodes/priorbox.cpp @@ -21,7 +21,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/prior_box.hpp" @@ -152,6 +151,7 @@ void PriorBox::createPrimitive() { } void PriorBox::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const int* in_data = getSrcDataAtPortAs(0); const int H = in_data[0]; const int W = in_data[1]; @@ -316,18 +316,18 @@ void PriorBox::execute([[maybe_unused]] const dnnl::stream& strm) { } if (clip) { - parallel_for((H * W * number_of_priors * 4), [&](size_t i) { + cpu_parallel->parallel_for((H * W * number_of_priors * 4), [&](size_t i) { dst_data[i] = (std::min)((std::max)(dst_data[i], 0.0F), 1.0F); }); } uint64_t channel_size = OH * OW; if (variance.size() == 1) { - parallel_for(channel_size, [&](size_t i) { + cpu_parallel->parallel_for(channel_size, [&](size_t i) { dst_data[i + channel_size] = variance[0]; }); } else { - parallel_for(H * W * number_of_priors, [&](size_t i) { + cpu_parallel->parallel_for(H * W * number_of_priors, [&](size_t i) { for (size_t j = 0; j < 4; ++j) { dst_data[i * 4 + j + channel_size] = variance[j]; } diff --git a/src/plugins/intel_cpu/src/nodes/priorbox_clustered.cpp b/src/plugins/intel_cpu/src/nodes/priorbox_clustered.cpp index d4b8c47b178f87..5954de0e990978 100644 --- a/src/plugins/intel_cpu/src/nodes/priorbox_clustered.cpp +++ b/src/plugins/intel_cpu/src/nodes/priorbox_clustered.cpp @@ -20,7 +20,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/prior_box_clustered.hpp" @@ -107,6 +106,7 @@ void PriorBoxClustered::createPrimitive() { } void PriorBoxClustered::execute([[maybe_unused]] const dnnl::stream& strm) { + const auto& cpu_parallel = context->getCpuParallel(); const int* in_data = getSrcDataAtPortAs(0); const int layer_height = in_data[0]; const int layer_width = in_data[1]; @@ -126,7 +126,7 @@ void PriorBoxClustered::execute([[maybe_unused]] const dnnl::stream& strm) { const auto& out_shape = getChildEdgeAt(0)->getMemory().getShape().getStaticDims(); size_t var_size = variances.size(); - parallel_for2d(layer_height, layer_width, [&](int64_t h, int64_t w) { + cpu_parallel->parallel_for2d(layer_height, layer_width, [&](int64_t h, int64_t w) { float center_x = (static_cast(w) + offset) * step_w; float center_y = (static_cast(h) + offset) * step_h; diff --git 
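The PSROIPooling and Reduce hunks below repeat the same mechanical rewrite. As a usage illustration (a standalone toy, not an excerpt from the PR), the before/after of one migrated loop body looks like this:

    // Toy before/after for the migration pattern applied throughout these files.
    #include <cstddef>
    #include <vector>

    #include "openvino/core/parallel.hpp"

    static void fill_before(std::vector<float>& dst) {
        // Old style: free function bound to the global threading backend.
        ov::parallel_for(dst.size(), [&](size_t i) {
            dst[i] = static_cast<float>(i);
        });
    }

    template <typename CpuParallelPtr>
    static void fill_after(const CpuParallelPtr& cpu_parallel, std::vector<float>& dst) {
        // New style: the loop runs under the context-owned parallel facade,
        // which can apply the partitioner configured for this compiled model.
        cpu_parallel->parallel_for(dst.size(), [&](size_t i) {
            dst[i] = static_cast<float>(i);
        });
    }
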
a/src/plugins/intel_cpu/src/nodes/psroi_pooling.cpp b/src/plugins/intel_cpu/src/nodes/psroi_pooling.cpp index b5b1224f4d57c9..3e230e9adb237c 100644 --- a/src/plugins/intel_cpu/src/nodes/psroi_pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/psroi_pooling.cpp @@ -24,7 +24,6 @@ #include "onednn/iml_type_mapper.h" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" -#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/deformable_psroi_pooling.hpp" @@ -277,6 +276,7 @@ void PSROIPooling::executeAverage(const inputType* srcData, const int roiBatchInd, const BlockedMemoryDesc& srcDesc, const BlockedMemoryDesc& dstDesc) { + const auto& cpu_parallel = context->getCpuParallel(); int inBlockSize = 0; int outBlockSize = 0; int outBlockCount = 0; @@ -338,7 +338,7 @@ void PSROIPooling::executeAverage(const inputType* srcData, } }; if (srcDesc.hasLayoutType(LayoutType::nspc)) { - parallel_for2d(nh, nw, [&](int h, int w) { + cpu_parallel->parallel_for2d(nh, nw, [&](int h, int w) { const int binOffsetOutput = n * nc * nh * nw; const int binOffsetInput = roiBatchInd * channels * height * width; for (int c = 0; c < nc; c++) { @@ -347,7 +347,7 @@ void PSROIPooling::executeAverage(const inputType* srcData, } }); } else if (srcDesc.hasLayoutType(LayoutType::ncsp)) { - parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { + cpu_parallel->parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { const int gc = (c * groupSize + h) * groupSize + w; const int outputBlockResidual = (dstDesc.hasLayoutType(LayoutType::ncsp) ? 0 : c % inBlockSize); const int outputBlockIdx = (c / outBlockSize) * outBlockSize; @@ -356,7 +356,7 @@ void PSROIPooling::executeAverage(const inputType* srcData, avgPsroi(c, h, w, 0, outputBlockResidual, binOffsetInput, binOffsetOutput); }); } else { // nChw16c, nChw8c - parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) { + cpu_parallel->parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) { int cStart = blkIdx * outBlockSize; int cEnd = (blkIdx == outBlockCount - 1 ? nc : cStart + outBlockSize); for (int c = cStart; c < cEnd; c++) { @@ -381,6 +381,7 @@ void PSROIPooling::executeBilinear(const inputType* srcData, const int roiBatchInd, const BlockedMemoryDesc& srcDesc, const BlockedMemoryDesc& dstDesc) { + const auto& cpu_parallel = context->getCpuParallel(); int inBlockSize = 0; int outBlockSize = 0; int outBlockCount = 0; @@ -488,17 +489,17 @@ void PSROIPooling::executeBilinear(const inputType* srcData, if (srcDesc.hasLayoutType(LayoutType::nspc)) { const int binOffsetOutput = currentRoi * nc * nh * nw; - parallel_for2d(nh, nw, [&](int h, int w) { + cpu_parallel->parallel_for2d(nh, nw, [&](int h, int w) { for (int c = 0; c < nc; c++) { bilinearPsroi(c, h, w, 0, binOffsetOutput + c); } }); } else if (srcDesc.hasLayoutType(LayoutType::ncsp)) { - parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { + cpu_parallel->parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { bilinearPsroi(c, h, w, 0, (currentRoi * outputChannelsPadding + c) * binCount); }); } else { // nChw16c, nChw8c - parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) { + cpu_parallel->parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) { int cStart = blkIdx * outBlockSize; int cEnd = (blkIdx == outBlockCount - 1 ? 
nc : cStart + outBlockSize); for (int c = cStart; c < cEnd; c++) { @@ -523,6 +524,7 @@ void PSROIPooling::executeBilinearDeformable(const inputType* srcData, const int channelsEachClass, const int currentRoi, const int roiBatchInd) { + const auto& cpu_parallel = context->getCpuParallel(); const float roiStartW = round(bottomRois[1]) * spatialScale - 0.5F; const float roiStartH = round(bottomRois[2]) * spatialScale - 0.5F; const float roiEndW = (round(bottomRois[3]) + 1.0F) * spatialScale - 0.5F; @@ -530,7 +532,7 @@ void PSROIPooling::executeBilinearDeformable(const inputType* srcData, // Force too small ROIs to be 1x1 const float roiWidth = std::max(roiEndW - roiStartW, 0.1F); // avoid 0 const float roiHeight = std::max(roiEndH - roiStartH, 0.1F); - parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { + cpu_parallel->parallel_for3d(nc, nh, nw, [&](int c, int h, int w) { size_t dstIndex = ((currentRoi * nc + c) * nh + h) * nw + w; dstData[dstIndex] = 0; // Compute w and h at bottom @@ -587,6 +589,7 @@ void PSROIPooling::executeBilinearDeformable(const inputType* srcData, template void PSROIPooling::executeSpecified() { + const auto& cpu_parallel = context->getCpuParallel(); const auto* srcData = getSrcDataAtPortAs(0); const auto* bottomRoisBeginning = getSrcDataAtPortAs(1); auto* dstData = getDstDataAtPortAs(0); @@ -613,7 +616,7 @@ void PSROIPooling::executeSpecified() { channelsEachClass /= numClasses; } - parallel_for(realRois, [&](int currentRoi) { + cpu_parallel->parallel_for(realRois, [&](int currentRoi) { const float* bottomRois = bottomRoisBeginning + currentRoi * 5; auto roiBatchInd = static_cast(bottomRois[0]); if (getAlgorithm() == Algorithm::PSROIPoolingAverage) { diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index 422c5ed2619e0c..67ac320e267e03 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -2551,13 +2551,14 @@ void Reduce::reduce_type(const uint8_t* in_ptr, uint8_t* out_ptr) { } void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { + const auto& cpu_parallel = context->getCpuParallel(); output_info_reassign(&out_ptr); init_dst_data(out_ptr, dst_size); if (ReduceN && !ReduceC && !ReduceD && !ReduceH && !ReduceW) { size_t IA = IC * ID * IH * IW; reduce_stride = IA; - parallel_for(IA / blk_size, [&](size_t iba) { + cpu_parallel->parallel_for(IA / blk_size, [&](size_t iba) { size_t oba = iba; reduce_kernel_process(in_ptr + iba * blk_size * src_data_size, out_ptr + oba * blk_size * dst_data_size, @@ -2587,7 +2588,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { for (size_t i = 0; i < blk_size; i++) { index_buf[i] = i * work_amount * src_data_size; } - parallel_for(IK, [&](size_t ik) { + cpu_parallel->parallel_for(IK, [&](size_t ik) { size_t ok = ik; reduce_kernel_process(in_ptr_n + ik * blk_size * inner_size * src_data_size, out_ptr_n + ok * blk_size * output_inner_size * dst_data_size, @@ -2598,7 +2599,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { }); size_t tail_start = IK * blk_size; size_t IT = outer_size - tail_start; - parallel_for(IT, [&](size_t it) { + cpu_parallel->parallel_for(IT, [&](size_t it) { size_t ot = it; reduce_kernel_process(in_ptr_n + (tail_start + it) * inner_size * src_data_size, out_ptr_n + (tail_start + ot) * output_inner_size * dst_data_size, @@ -2607,14 +2608,14 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { }); } else { if (ReduceH) { - parallel_for2d(IC, 
ID, [&](size_t ic, size_t id) { + cpu_parallel->parallel_for2d(IC, ID, [&](size_t ic, size_t id) { size_t oc = ic; size_t od = id; GET_PTR_NCD_BASE_PTR_N_PLN; reduce_kernel_process(in_ptr_ncd, out_ptr_ncd, work_amount, 1); }); } else { - parallel_for3d(IC, ID, IH, [&](size_t ic, size_t id, size_t ih) { + cpu_parallel->parallel_for3d(IC, ID, IH, [&](size_t ic, size_t id, size_t ih) { size_t oc = ic; size_t od = id; GET_PTR_NCD_BASE_PTR_N_PLN; @@ -2642,7 +2643,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { init_dst_data(prc_ptr_n, prc_size); size_t IS = IH * IW; reduce_stride = IS; - parallel_for(IS / blk_size, [&](size_t ibs) { + cpu_parallel->parallel_for(IS / blk_size, [&](size_t ibs) { size_t pbs = ibs; reduce_kernel_process(in_ptr_n + ibs * blk_size * src_data_size, prc_ptr_n + pbs * blk_size * prc_data_size, @@ -2658,7 +2659,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { IC * ID); // step2: ReduceW reduce_kernel_reassign(); - parallel_for(PH, [&](size_t ph) { + cpu_parallel->parallel_for(PH, [&](size_t ph) { size_t oh = ph; reduce_kernel_process(prc_ptr_n + ph * PW * prc_data_size, out_ptr_n + oh * OW * dst_data_size, @@ -2673,7 +2674,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { for (size_t id = 0; id < ID; id++) { size_t od = ReduceD ? 0 : id; GET_PTR_NCD_PLN; - parallel_for(IH, [&](size_t ih) { + cpu_parallel->parallel_for(IH, [&](size_t ih) { size_t oh = ih; GET_PTR_NCDH_PLN; reduce_kernel_process(in_ptr_ncdh, out_ptr_ncdh, IW, 1); @@ -2696,11 +2697,11 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { } } } else if (!ReduceC && !ReduceD && ReduceH && !ReduceW) { - parallel_for2d(IC, ID, [&](size_t ic, size_t id) { + cpu_parallel->parallel_for2d(IC, ID, [&](size_t ic, size_t id) { size_t oc = ic; size_t od = id; GET_PTR_NCD_BASE_PTR_N_PLN; - parallel_for(IW / blk_size, [&](size_t ibw) { + cpu_parallel->parallel_for(IW / blk_size, [&](size_t ibw) { size_t obw = ibw; reduce_kernel_process(in_ptr_ncd + ibw * blk_size * src_data_size, out_ptr_ncd + obw * blk_size * dst_data_size, @@ -2723,7 +2724,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { // step1: !ReduceD && ReduceH && !ReduceW uint8_t* prc_ptr_n = vec_reduceDH_prc.data(); init_dst_data(prc_ptr_n, prc_size); - parallel_for2d(ID, IWB, [&](size_t id, size_t iwb) { + cpu_parallel->parallel_for2d(ID, IWB, [&](size_t id, size_t iwb) { size_t pd = id; size_t pwb = iwb; reduce_kernel_process(in_ptr_n + (id * IH * IW + iwb * blk_size) * src_data_size, @@ -2735,7 +2736,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { // step2: ReduceD reduce_stride = PW; reduce_kernel_reassign(); - parallel_for(IWB, [&](size_t iwb) { + cpu_parallel->parallel_for(IWB, [&](size_t iwb) { size_t pwb = iwb; size_t owb = iwb; reduce_kernel_process(prc_ptr_n + pwb * blk_size * prc_data_size, @@ -2749,7 +2750,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { // reduce tail reduce_stride = IW; size_t tail_start = IWB * blk_size; - parallel_for(IW - tail_start, [&](size_t i_tail) { + cpu_parallel->parallel_for(IW - tail_start, [&](size_t i_tail) { reduce_kernel_process(in_ptr_n + (tail_start + i_tail) * src_data_size, out_ptr_n + (tail_start + i_tail) * dst_data_size, 1, @@ -2757,10 +2758,10 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { ID * IH); }); } else { - parallel_for(IC, [&](size_t ic) { + cpu_parallel->parallel_for(IC, [&](size_t ic) { size_t oc = ic; GET_PTR_NC_PLN; - 
parallel_for(IWB, [&](size_t iwb) { + cpu_parallel->parallel_for(IWB, [&](size_t iwb) { size_t owb = iwb; reduce_kernel_process(in_ptr_nc + iwb * blk_size * src_data_size, out_ptr_nc + owb * blk_size * dst_data_size, @@ -2769,7 +2770,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { ID * IH); }); size_t tail_start = IWB * blk_size; - parallel_for(IW - tail_start, [&](size_t i_tail) { + cpu_parallel->parallel_for(IW - tail_start, [&](size_t i_tail) { reduce_kernel_process(in_ptr_nc + (tail_start + i_tail) * src_data_size, out_ptr_nc + (tail_start + i_tail) * dst_data_size, 1, @@ -2779,7 +2780,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { }); } } else if (ReduceC && ReduceD && ReduceH && !ReduceW) { - parallel_for(IW / blk_size, [&](size_t ibw) { + cpu_parallel->parallel_for(IW / blk_size, [&](size_t ibw) { size_t obw = ibw; reduce_kernel_process(in_ptr_n + ibw * blk_size * src_data_size, out_ptr_n + obw * blk_size * dst_data_size, @@ -2797,7 +2798,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { } else if (ReduceC && !ReduceD && !ReduceH && !ReduceW) { size_t IS = ID * IH * IW; reduce_stride = IS; - parallel_for(IS / blk_size, [&](size_t ibs) { + cpu_parallel->parallel_for(IS / blk_size, [&](size_t ibs) { size_t obs = ibs; reduce_kernel_process(in_ptr_n + ibs * blk_size * src_data_size, out_ptr_n + obs * blk_size * dst_data_size, @@ -2846,6 +2847,7 @@ void Reduce::reduce_PLN(const uint8_t* in_ptr, uint8_t* out_ptr) { } void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { + const auto& cpu_parallel = context->getCpuParallel(); size_t ICB = div_up(IC, blk_size); size_t OCB = div_up(OC, blk_size); output_info_reassign(&out_ptr); @@ -2859,7 +2861,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { apply_division = getAlgorithm() == Algorithm::ReduceMean && attr.get()->post_ops_.len() == 0; apply_post_kernel = !apply_division; } - parallel_for2d(ICB, ID, [&](size_t icb, size_t id) { + cpu_parallel->parallel_for2d(ICB, ID, [&](size_t icb, size_t id) { size_t ocb = icb; size_t od = id; GET_PTR_NCD_BASE_PTR_N_BLK; @@ -2874,7 +2876,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { init_dst_data(vec_prc.data(), prc_size); uint8_t* out_ptr_n_cp = out_ptr_n; out_ptr_n = vec_prc.data(); - parallel_for(ICB, [&](size_t icb) { + cpu_parallel->parallel_for(ICB, [&](size_t icb) { size_t ocb = icb; GET_PTR_NC_BLK; reduce_kernel_process(in_ptr_nc, out_ptr_nc, ID * IH * IW * blk_size); @@ -2902,7 +2904,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { } } else if (ReduceC && !ReduceD && !ReduceH && !ReduceW) { reduce_stride = ID * IH * IW * blk_size; - parallel_for3d(ID, IH, IW, [&](size_t id, size_t ih, size_t iw) { + cpu_parallel->parallel_for3d(ID, IH, IW, [&](size_t id, size_t ih, size_t iw) { size_t icb = 0; size_t ocb = 0; GET_PTR_NC_BLK; @@ -2924,7 +2926,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) { for (size_t ih = 0; ih < IH; ih++) { size_t oh = ReduceH ? 
 0 : ih;
                     GET_PTR_NCDH_BLK;
-                    parallel_for(IW, [&](size_t iw) {
+                    cpu_parallel->parallel_for(IW, [&](size_t iw) {
                         size_t ow = iw;
                         GET_PTR_NCDHW_BLK;
                         reduce_kernel_process(in_ptr_ncdhw, out_ptr_ncdhw, blk_size);
@@ -2942,6 +2944,7 @@ void Reduce::reduce_BLK(const uint8_t* in_ptr, uint8_t* out_ptr) {
 }
 
 void Reduce::reduce_BLK_concern_padding(const uint8_t* in_ptr, uint8_t* out_ptr) {
+    const auto& cpu_parallel = context->getCpuParallel();
     size_t ICB = div_up(IC, blk_size);
     size_t OCB = div_up(OC, blk_size);
     output_info_reassign(&out_ptr);
@@ -2968,7 +2971,7 @@ void Reduce::reduce_BLK_concern_padding(const uint8_t* in_ptr, uint8_t* out_ptr)
             size_t ocb = 0;
             ;
             size_t ic = icb * blk_size;
-            parallel_for(ID, [&](size_t id) {
+            cpu_parallel->parallel_for(ID, [&](size_t id) {
                 size_t od = id;
                 GET_PTR_NCD_BASE_PTR_N_BLK;
                 if (ic + blk_size <= IC) {
@@ -3024,7 +3027,7 @@ void Reduce::reduce_BLK_concern_padding(const uint8_t* in_ptr, uint8_t* out_ptr)
                 for (size_t ih = 0; ih < IH; ih++) {
                     size_t oh = ReduceH ? 0 : ih;
                     GET_PTR_NCDH_BLK;
-                    parallel_for(IW, [&](size_t iw) {
+                    cpu_parallel->parallel_for(IW, [&](size_t iw) {
                         size_t ow = iw;
                         GET_PTR_NCDHW_BLK;
                         reduce_kernel_process(in_ptr_ncdhw, out_ptr_ncdhw, blk_size);
@@ -3064,11 +3067,12 @@ inline void Reduce::reduce_kernel_process(const uint8_t* in_p,
 }
 
 inline void Reduce::reduce_kernel_post_process(uint8_t* out_ptr) {
+    const auto& cpu_parallel = context->getCpuParallel();
     const uint8_t* in_ptr = fuse_low_precision ? static_cast<uint8_t*>(intermediate_buf.data()) : nullptr;
     const size_t integerDivisor = empty_input ? 1 : IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
     const auto divisor = static_cast<float>(integerDivisor);
     if (layout == ReduceLayoutType::reduce_ncsp) {
-        parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
+        cpu_parallel->parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
             const uint8_t* in_p = in_ptr + (ob * OC + oc) * OD * OH * OW * intermediate_data_size;
             uint8_t* out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size;
             auto arg = jit_reduce_post_call_args();
@@ -3107,7 +3111,7 @@ inline void Reduce::reduce_kernel_post_process(uint8_t* out_ptr) {
         });
     } else {
         size_t OCB = div_up(OC, blk_size);
-        parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {
+        cpu_parallel->parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {
             const uint8_t* in_p = in_ptr + (ob * OCB + ocb) * OD * OH * OW * blk_size * intermediate_data_size;
             uint8_t* out_p = out_ptr + (ob * OCB + ocb) * OD * OH * OW * blk_size * dst_data_size;
             auto arg = jit_reduce_post_call_args();
@@ -3171,11 +3175,12 @@ void Reduce::nspc2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     const size_t DIM4 = OH;
     const size_t stride1 = DIM2 * DIM3 * DIM4;
     const size_t stride0 = stride1 * DIM1;
+    const auto& cpu_parallel = context->getCpuParallel();
     if (dst_data_size == 4) {
         const auto* src_data = reinterpret_cast<const float*>(proc_ptr);
         auto* dst_data = reinterpret_cast<float*>(out_ptr);
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * stride0 + j * DIM1;
             auto dst_off = b * stride0 + j;
             for (size_t dim1 = 0; dim1 < DIM1; dim1++) {
@@ -3187,7 +3192,7 @@ void Reduce::nspc2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     } else if (dst_data_size == 2) {
         const auto* src_data = reinterpret_cast<const uint16_t*>(proc_ptr);
         auto* dst_data = reinterpret_cast<uint16_t*>(out_ptr);
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * stride0 + j * DIM1;
             auto dst_off = b * stride0 + j;
             for (size_t dim1 = 0; dim1 < DIM1; dim1++) {
@@ -3199,7 +3204,7 @@ void Reduce::nspc2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     } else {
         const auto* src_data = proc_ptr;
         auto* dst_data = out_ptr;
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * stride0 + j * DIM1;
             auto dst_off = b * stride0 + j;
             for (size_t dim1 = 0; dim1 < DIM1; dim1++) {
@@ -3212,6 +3217,7 @@ void Reduce::nspc2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
 }
 
 void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
+    const auto& cpu_parallel = context->getCpuParallel();
     const size_t DIM0 = OB;
     const size_t DIM1 = OC;
     const size_t DIM2 = OD;
@@ -3224,7 +3230,7 @@ void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     if (dst_data_size == 4) {
         const auto* src_data = reinterpret_cast<const float*>(proc_ptr);
         auto* dst_data = reinterpret_cast<float*>(out_ptr);
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * src_stride0 + j * blk_size;
             auto dst_off = b * dst_stride0 + j;
             for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) {
@@ -3245,7 +3251,7 @@ void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     } else if (dst_data_size == 2) {
         const auto* src_data = reinterpret_cast<const uint16_t*>(proc_ptr);
         auto* dst_data = reinterpret_cast<uint16_t*>(out_ptr);
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * src_stride0 + j * blk_size;
             auto dst_off = b * dst_stride0 + j;
             for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) {
@@ -3266,7 +3272,7 @@ void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
     } else {
         const auto* src_data = proc_ptr;
         auto* dst_data = out_ptr;
-        parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
+        cpu_parallel->parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
             auto src_off = b * src_stride0 + j * blk_size;
             auto dst_off = b * dst_stride0 + j;
             for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) {
@@ -3288,6 +3294,7 @@ void Reduce::blocked2ncsp(const uint8_t* proc_ptr, uint8_t* out_ptr) const {
 }
 
 inline void Reduce::init_dst_data(uint8_t* out_ptr, size_t dst_size) {
+    const auto& cpu_parallel = context->getCpuParallel();
     switch (algorithm) {
     case Algorithm::ReduceL1:
     case Algorithm::ReduceL2:
@@ -3303,32 +3310,32 @@ inline void Reduce::init_dst_data(uint8_t* out_ptr, size_t dst_size) {
     case Algorithm::ReduceProd:
         if (output_prec == ov::element::f32) {
             auto* out_p = reinterpret_cast<float*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<float>(1);
             });
         } else if (output_prec == ov::element::i32) {
             auto* out_p = reinterpret_cast<int32_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<int32_t>(1);
             });
         } else if (output_prec == ov::element::bf16) {
             auto* out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<bfloat16_t>(1);
             });
         } else if (output_prec == ov::element::f16) {
             auto* out_p = reinterpret_cast<ov::float16*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<ov::float16>(1);
             });
         } else if (output_prec == ov::element::u8) {
             auto* out_p = out_ptr;
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<uint8_t>(1);
             });
         } else if (output_prec == ov::element::i8) {
             auto* out_p = reinterpret_cast<int8_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = static_cast<int8_t>(1);
             });
         }
@@ -3336,32 +3343,32 @@ inline void Reduce::init_dst_data(uint8_t* out_ptr, size_t dst_size) {
     case Algorithm::ReduceMax:
         if (output_prec == ov::element::f32) {
             auto* out_p = reinterpret_cast<float*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<float>::lowest();
             });
         } else if (output_prec == ov::element::i32) {
             auto* out_p = reinterpret_cast<int32_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<int32_t>::min();
             });
         } else if (output_prec == ov::element::bf16) {
             auto* out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<bfloat16_t>::lowest();
             });
         } else if (output_prec == ov::element::f16) {
             auto* out_p = reinterpret_cast<ov::float16*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<ov::float16>::lowest();
             });
         } else if (output_prec == ov::element::u8) {
             auto* out_p = out_ptr;
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<uint8_t>::min();
             });
         } else if (output_prec == ov::element::i8) {
             auto* out_p = reinterpret_cast<int8_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<int8_t>::min();
             });
         }
@@ -3369,32 +3376,32 @@ inline void Reduce::init_dst_data(uint8_t* out_ptr, size_t dst_size) {
     case Algorithm::ReduceMin:
         if (output_prec == ov::element::f32) {
             auto* out_p = reinterpret_cast<float*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<float>::max();
             });
         } else if (output_prec == ov::element::i32) {
             auto* out_p = reinterpret_cast<int32_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<int32_t>::max();
             });
         } else if (output_prec == ov::element::bf16) {
             auto* out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<bfloat16_t>::max();
            });
         } else if (output_prec == ov::element::f16) {
             auto* out_p = reinterpret_cast<ov::float16*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<ov::float16>::max();
             });
         } else if (output_prec == ov::element::u8) {
             auto* out_p = out_ptr;
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<uint8_t>::max();
             });
         } else if (output_prec == ov::element::i8) {
             auto* out_p = reinterpret_cast<int8_t*>(out_ptr);
-            parallel_for(dst_size / dst_data_size, [&](size_t i) {
+            cpu_parallel->parallel_for(dst_size / dst_data_size, [&](size_t i) {
                 out_p[i] = std::numeric_limits<int8_t>::max();
             });
         }
@@ -3710,6 +3717,7 @@ void Reduce::reduce_ref_process(const float* in_ptr,
 }
 
 inline void Reduce::reduce_ref_map(float* out_ptr, size_t work_amount_dst, size_t reduced_dims_work_amount) {
+    const auto& cpu_parallel = context->getCpuParallel();
     switch (algorithm) {
     case Algorithm::ReduceAnd:
     case Algorithm::ReduceL1:
@@ -3721,18 +3729,18 @@ inline void Reduce::reduce_ref_map(float* out_ptr, size_t work_amount_dst, size_
     case Algorithm::ReduceSumSquare:
         break;
     case Algorithm::ReduceL2:
-        parallel_for(work_amount_dst, [&](size_t i) {
+        cpu_parallel->parallel_for(work_amount_dst, [&](size_t i) {
            out_ptr[i] = std::sqrt(out_ptr[i]);
         });
         break;
     case Algorithm::ReduceLogSum:
     case Algorithm::ReduceLogSumExp:
-        parallel_for(work_amount_dst, [&](size_t i) {
+        cpu_parallel->parallel_for(work_amount_dst, [&](size_t i) {
             out_ptr[i] = logf(out_ptr[i]);
         });
         break;
     case Algorithm::ReduceMean:
-        parallel_for(work_amount_dst, [&](size_t i) {
+        cpu_parallel->parallel_for(work_amount_dst, [&](size_t i) {
             out_ptr[i] /= reduced_dims_work_amount;
         });
         break;
diff --git a/src/plugins/intel_cpu/src/nodes/region_yolo.cpp b/src/plugins/intel_cpu/src/nodes/region_yolo.cpp
index b295b4e93615ae..22bd04af6c9492 100644
--- a/src/plugins/intel_cpu/src/nodes/region_yolo.cpp
+++ b/src/plugins/intel_cpu/src/nodes/region_yolo.cpp
@@ -24,7 +24,6 @@
 #include "onednn/iml_type_mapper.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/op/region_yolo.hpp"
@@ -392,10 +391,11 @@ inline float RegionYolo::logistic_scalar(float src) {
 }
 
 inline void RegionYolo::calculate_logistic(size_t start_index, int count, uint8_t* dst_data) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto dst_data_size = output_prec.size();
     if (logistic_kernel) {
         int blocks_num = div_up(count, block_size);
-        parallel_for(blocks_num, [&](int ib) {
+        cpu_parallel->parallel_for(blocks_num, [&](int ib) {
             int idx = ib * block_size;
             int work_amount = std::min(count - idx, block_size);
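
The change repeated through these node files is mechanical: the free parallel_for/parallel_for2d/parallel_for3d/parallel_for4d helpers from openvino/core/parallel.hpp are replaced with calls on a CpuParallel object obtained from the graph context, so the TBB partitioning policy becomes a per-context choice rather than a process-wide one. A minimal, self-contained sketch of the call shape; the CpuParallel below is a serial mock written only for illustration, while the real class lives in cpu_parallel.hpp and dispatches to TBB:

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <memory>

    // Serial mock of the CpuParallel interface the nodes now call; the real
    // class dispatches these loops to TBB with a configurable partitioner.
    struct CpuParallel {
        void parallel_for(size_t n, const std::function<void(size_t)>& body) const {
            for (size_t i = 0; i < n; i++) body(i);
        }
        void parallel_for2d(size_t d0, size_t d1, const std::function<void(size_t, size_t)>& body) const {
            for (size_t i0 = 0; i0 < d0; i0++)
                for (size_t i1 = 0; i1 < d1; i1++) body(i0, i1);
        }
    };

    int main() {
        // In the plugin this comes from the graph context: context->getCpuParallel().
        auto cpu_parallel = std::make_shared<CpuParallel>();
        cpu_parallel->parallel_for2d(2, 3, [&](size_t b, size_t j) {
            std::cout << "tile " << b << "," << j << "\n";
        });
    }
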
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp
index 7a275171d08447..99e4a705069529 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -38,8 +38,8 @@
 #include "onednn/iml_type_mapper.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type/element_type.hpp"
+#include "thread_pool_imp.hpp"
 #include "utils/debug_capabilities.h"
 #include "utils/general_utils.h"
 
@@ -353,6 +353,7 @@ bool Reorder::created() const {
 }
 
 void Reorder::optimizedNcsp2Nspc() {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto parentEdge = getParentEdgeAt(0);
     auto childEdge = getChildEdgeAt(0);
 
@@ -374,7 +375,7 @@ void Reorder::optimizedNcsp2Nspc() {
     const size_t stride1 = DIM2 * DIM3 * DIM4;
     const size_t stride2 = DIM2 * DIM3;
 
-    parallel_for3d(DIM0, DIM1, stride2, [&](size_t dim0, size_t dim1, size_t j) {
+    cpu_parallel->parallel_for3d(DIM0, DIM1, stride2, [&](size_t dim0, size_t dim1, size_t j) {
         size_t src_off = dim0 * src_batch_stride + j * DIM4 + dim1 * stride1;
         size_t dst_off = dim0 * dst_batch_stride + j * DIM4 * dst_channel_stride + dim1;
 
@@ -387,6 +388,7 @@ void Reorder::optimizedNcsp2Nspc() {
 }
 
 void Reorder::optimizedNspc2Ncsp() {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto parentEdge = getParentEdgeAt(0);
     auto childEdge = getChildEdgeAt(0);
 
@@ -405,7 +407,7 @@ void Reorder::optimizedNspc2Ncsp() {
     const size_t block_size = DIM2 * DIM3 * DIM4;
     const size_t src_batch_stride = block_size * DIM1;
     const size_t dst_batch_stride = dstStrides[0];
-    parallel_for2d(DIM0, block_size, [&](size_t b, size_t j) {
+    cpu_parallel->parallel_for2d(DIM0, block_size, [&](size_t b, size_t j) {
         auto src_off = b * src_batch_stride + j * DIM1;
         auto dst_off = b * dst_batch_stride + j;
         for (size_t dim1 = 0; dim1 < DIM1; ++dim1) {
@@ -468,7 +470,10 @@ std::string Reorder::getReorderArgs(const MemoryDesc& parentDesc, const MemoryDe
     return inArgs + "_" + outArgs;
 }
 
-void Reorder::reorderData(const IMemory& input, const IMemory& output, const MultiCachePtr& cache) {
+void Reorder::reorderData(const IMemory& input,
+                          const IMemory& output,
+                          const MultiCachePtr& cache,
+                          const std::shared_ptr<ThreadPool>& threadPool) {
     OPENVINO_ASSERT(input.getDesc().isDefined() && output.getDesc().isDefined(),
                     "Can't reorder data with dynamic shapes");
 
@@ -541,7 +546,7 @@ void Reorder::reorderData(const IMemory& input, const IMemory& output, const Mul
                               output.getDesc().serializeFormat());
         }
         if (reorder) {
-            dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
+            dnnl::stream loc_stream = make_stream(engine, threadPool);
             reorder.execute(loc_stream, {{DNNL_ARG_FROM, srcMemory}, {DNNL_ARG_TO, dstMemory}});
         } else {
             OPENVINO_THROW("Could not make onednn reorder.");
diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h
index 6c444d41eb7fb8..477278a4b15a51 100644
--- a/src/plugins/intel_cpu/src/nodes/reorder.h
+++ b/src/plugins/intel_cpu/src/nodes/reorder.h
@@ -71,7 +71,10 @@ class Reorder : public Node {
 
     static std::string getReorderArgs(const MemoryDesc& parentDesc, const MemoryDesc& childDesc);
 
-    static void reorderData(const IMemory& input, const IMemory& output, const MultiCachePtr& cache = nullptr);
+    static void reorderData(const IMemory& input,
+                            const IMemory& output,
+                            const MultiCachePtr& cache = nullptr,
+                            const std::shared_ptr<ThreadPool>& threadPool = nullptr);
 
 private:
     dnnl::reorder::primitive prim;
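
Reorder::reorderData only changes how its oneDNN stream is constructed; the reorder primitive itself is untouched, and the nullptr default on the new parameter keeps existing call sites compiling (rnn.cpp below passes the context pool explicitly). For orientation, a standalone oneDNN reorder equivalent to the fallback path above, with comments marking the one line this patch swaps; buffer shapes and names here are illustrative:

    #include <oneapi/dnnl/dnnl.hpp>
    #include <vector>

    int main() {
        dnnl::engine eng(dnnl::engine::kind::cpu, 0);
        dnnl::memory::desc src_md({2, 3}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
        dnnl::memory::desc dst_md({2, 3}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ba);
        std::vector<float> src_buf(6, 1.0F);
        std::vector<float> dst_buf(6, 0.0F);
        dnnl::memory src(src_md, eng, src_buf.data());
        dnnl::memory dst(dst_md, eng, dst_buf.data());

        dnnl::reorder::primitive_desc pd(eng, src_md, eng, dst_md);
        dnnl::reorder reorder_prim(pd);

        // The patch's only change in reorderData(): instead of
        //   dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
        // the stream comes from make_stream(engine, threadPool), which attaches
        // a threadpool_interop pool under TBB_ADAPTIVE builds.
        dnnl::stream strm(eng);
        reorder_prim.execute(strm, src, dst);
        strm.wait();
        return 0;
    }
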
diff --git a/src/plugins/intel_cpu/src/nodes/rnn.cpp b/src/plugins/intel_cpu/src/nodes/rnn.cpp
index 74bead6998c3ae..410c98b549984a 100644
--- a/src/plugins/intel_cpu/src/nodes/rnn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/rnn.cpp
@@ -40,7 +40,6 @@
 #include "openvino/core/coordinate_diff.hpp"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/core/type/element_type_traits.hpp"
@@ -924,6 +923,7 @@ void RNN::fillSequenceDesc() {
 template <ov::element::Type_t ET>
 void RNN::fillWeights() {
     using DataType = typename element_type_traits<ET>::value_type;
+    const auto& cpu_parallel = context->getCpuParallel();
     CPU_NODE_ASSERT(getParentEdgeAt(wIdx)->getParent()->getType() == Type::Input, "expects Constant for port ", wIdx);
     auto w_const_blob = static_cast<Input*>(getParentEdgeAt(wIdx)->getParent().get())->getMemoryPtr();
     CPU_NODE_ASSERT(getParentEdgeAt(rIdx)->getParent()->getType() == Type::Input, "expects Constant for port ", rIdx);
@@ -956,7 +956,7 @@ void RNN::fillWeights() {
     const uint64_t step = SC * G;
     const uint64_t SC_DC = SC * DC;
 
-    parallel_for2d(G, SC, [&](size_t g, size_t out_i) {
+    cpu_parallel->parallel_for2d(G, SC, [&](size_t g, size_t out_i) {
         DataType* l_w_ptr = w_ptr + m_gate_map[g] * SC + out_i;
         DataType* s_w_ptr = ie_w_ptr + out_i * DC + g * SC_DC;
         for (size_t in_i = 0; in_i < DC; in_i++) {
@@ -992,7 +992,7 @@ void RNN::fillWeights() {
     const uint64_t step = SC * G;
     const uint64_t SC_2 = SC * SC;
 
-    parallel_for2d(G, SC, [&](size_t g, size_t out_i) {
+    cpu_parallel->parallel_for2d(G, SC, [&](size_t g, size_t out_i) {
         DataType* l_r_ptr = r_ptr + m_gate_map[g] * SC + out_i;
         DataType* s_r_ptr = ie_r_ptr + out_i * SC + g * SC_2;
         for (size_t in_i = 0; in_i < SC; in_i++) {
@@ -1024,6 +1024,7 @@ void RNN::fillWeights() {
 template <ov::element::Type_t ET>
 void RNN::fillBiases() {
     using DataType = typename element_type_traits<ET>::value_type;
+    const auto& cpu_parallel = context->getCpuParallel();
     CPU_NODE_ASSERT(getParentEdgeAt(bIdx)->getParent()->getType() == Type::Input, "expects Constant for port ", bIdx);
     auto b_const_blob = static_cast<Input*>(getParentEdgeAt(bIdx)->getParent().get())->getMemoryPtr();
 
@@ -1062,7 +1063,7 @@ void RNN::fillBiases() {
     }
 
     const uint64_t step = SC * sizeof(DataType);
-    parallel_for(Gb, [&](size_t g) {
+    cpu_parallel->parallel_for(Gb, [&](size_t g) {
         DataType* l_b_ptr = b_ptr + m_gate_map[g] * SC;
         const DataType* l_ie_b_ptr = ie_b_ptr + g * SC;
         cpu_memcpy(l_b_ptr, l_ie_b_ptr, step);
@@ -1087,7 +1088,10 @@ void RNN::prepareMemory(const DnnlMemoryDescPtr& new_desc, size_t idx) {
     auto create = [&]() {
         Memory memory{getEngine(), m_initial_weights[idx]->getDescPtr(), m_initial_weights[idx]->getData()};
         MemoryPtr res_ptr = std::make_shared<Memory>(getEngine(), new_desc);
-        node::Reorder::reorderData(memory, *res_ptr, context->getParamsCache());
+        node::Reorder::reorderData(memory,
+                                   *res_ptr,
+                                   context->getParamsCache(),
+                                   context->getCpuParallel()->get_thread_pool());
         return res_ptr;
     };
diff --git a/src/plugins/intel_cpu/src/nodes/roi_align.cpp b/src/plugins/intel_cpu/src/nodes/roi_align.cpp
index 911294f8c03df6..f9c063dd8be291 100644
--- a/src/plugins/intel_cpu/src/nodes/roi_align.cpp
+++ b/src/plugins/intel_cpu/src/nodes/roi_align.cpp
@@ -922,6 +922,7 @@ void ROIAlign::execute([[maybe_unused]] const dnnl::stream& strm) {
 
 template <typename inputType, typename outputType>
 void ROIAlign::executeSpecified() {
+    const auto& cpu_parallel = context->getCpuParallel();
     const auto& srcMemory0 = getParentEdgeAt(0)->getMemory();
     const auto& srcMemory1 = getParentEdgeAt(1)->getMemory();
     const auto& dstMemory = getChildEdgeAt(0)->getMemory();
@@ -993,7 +994,7 @@ void ROIAlign::executeSpecified() {
         }
     }
 
-    parallel_for(realRois, [&](size_t n) {
+    cpu_parallel->parallel_for(realRois, [&](size_t n) {
         int roiOff = n * 4;
         const float* srcRoiPtr = &srcRoi[roiOff];
         int roiBatchInd = srcRoiIdx[n];
@@ -1165,7 +1166,7 @@ void ROIAlign::executeSpecified() {
             });
         } else {
             // one lane for one sample generation, then pooling all samples.
-            parallel_for4d(realRois, C, pooledH, pooledW, [&](int n, int cIdx, int yBinInd, int xBinInd) {
+            cpu_parallel->parallel_for4d(realRois, C, pooledH, pooledW, [&](int n, int cIdx, int yBinInd, int xBinInd) {
                 size_t batchSrcOffset = srcRoiIdx[n] * batchInputStride;
                 size_t channelSrcOffset = batchSrcOffset + cIdx * H * W;
                 size_t binOffset = yBinInd * pooledW + xBinInd;
@@ -1187,7 +1188,7 @@ void ROIAlign::executeSpecified() {
             }
         } else {  // ref with planar
-            parallel_for4d(realRois, C, pooledH, pooledW, [&](int n, int cIdx, int yBinInd, int xBinInd) {
+            cpu_parallel->parallel_for4d(realRois, C, pooledH, pooledW, [&](int n, int cIdx, int yBinInd, int xBinInd) {
                 int numSamplesROI = numSamples[n];
                 size_t batchSrcOffset = srcRoiIdx[n] * batchInputStride;
                 size_t channelSrcOffset = batchSrcOffset + cIdx * H * W;
diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
index c0dcf1b74aca65..ba0c226affb979 100644
--- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
@@ -1918,6 +1918,7 @@ void ScaledDotProductAttention::updateBeamTable(const MemoryPtr& mem_beam_idx, s
 
 // Update pastkv using cur_k, cur_v, simply append cur_k, cur_v to the end of pastkv in the state.
 void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const MemoryPtr& mem_cur_v) {
+    const auto& cpu_parallel = context->getCpuParallel();
     // L, B, H, S -> [2, 0, 1, 3] -> B, H, L, S
     std::vector<size_t> order = {0, 1, 2, 3};
     if (!m_config.config.permute_axes.empty()) {
@@ -2019,14 +2020,14 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M
         [&](const SDPAQuantParam& quant_param, PlainTensor& new_scale_zp, PlainTensor& old_scale_zp) {
             if (quant_param.isByChannel) {
                 size_t group_nums = div_up(L0, quant_param.groupSize) * 2;
-                parallel_for(group_nums, [&](size_t m) {
+                cpu_parallel->parallel_for(group_nums, [&](size_t m) {
                     memcpy(new_scale_zp.ptr<float>(m),
                            old_scale_zp.ptr<float>(m),
                            sizeof(float) * old_scale_zp.m_dims[1] * old_scale_zp.m_dims[2] *
                                old_scale_zp.m_dims[3]);
                 });
             } else {
-                parallel_for(L0, [&](size_t m) {
+                cpu_parallel->parallel_for(L0, [&](size_t m) {
                     memcpy(new_scale_zp.ptr<float>(m),
                            old_scale_zp.ptr<float>(m),
                            sizeof(float) * old_scale_zp.m_dims[1] * old_scale_zp.m_dims[2] *
diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp
index 85195237af4317..811b076a283239 100644
--- a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp
+++ b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp
@@ -985,6 +985,7 @@ void ScatterUpdate::execute([[maybe_unused]] const dnnl::stream& strm) {
 // and indices tensor of shape [i_0, i_1, ..., i_k].
 // Updates tensor shape should be [d_0, d_1, ... d_(axis - 1), i_0, i_1, ..., i_k, d_(axis + 1), ..., d_n].
 void ScatterUpdate::scatterUpdate(uint8_t* indices, uint8_t* update, int axis, uint8_t* dstData) {
+    const auto& cpu_parallel = context->getCpuParallel();
     const auto& srcDataDim = getParentEdgeAt(DATA_ID)->getMemory().getStaticDims();
     const auto& indicesDim = getParentEdgeAt(INDICES_ID)->getMemory().getStaticDims();
     const auto& updateDim = getParentEdgeAt(UPDATE_ID)->getMemory().getStaticDims();
@@ -1006,7 +1007,7 @@ void ScatterUpdate::scatterUpdate(uint8_t* indices, uint8_t* update, int axis, u
     size_t blockToUpdate = srcBlockND[axis + 1];
     size_t blockToUpdateSize = blockToUpdate * dataSize;
 
-    parallel_for2d(batchToUpdate, idxLength, [&](size_t b, size_t idx) {
+    cpu_parallel->parallel_for2d(batchToUpdate, idxLength, [&](size_t b, size_t idx) {
         int64_t idxValue = getIndicesValue(indices, idx);
         uint8_t* dstEntry = dstData + (b * srcBlockND[axis] + idxValue * blockToUpdate) * dataSize;
         uint8_t* updateEntry = update + (b * updateBlockND[axis] + idx * blockToUpdate) * dataSize;
@@ -1038,6 +1039,7 @@ void ScatterUpdate::scatterNDUpdate(const MemoryPtr& mem_data,
                                     const MemoryPtr& mem_indices,
                                     const MemoryPtr& mem_updates,
                                     [[maybe_unused]] const scatter_reductions::ReduceNone& kernel) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto* indices = mem_indices->getDataAs<uint8_t>();
     auto* update = mem_updates->getDataAs<uint8_t>();
     auto* dstData = mem_data->getDataAs<uint8_t>();
@@ -1055,7 +1057,7 @@ void ScatterUpdate::scatterNDUpdate(const MemoryPtr& mem_data,
     }
 
     size_t sizeToUpdate = srcBlockND[k] * dataSize;
-    parallel_for(idxTupleNum, [&](size_t tupleIdx) {
+    cpu_parallel->parallel_for(idxTupleNum, [&](size_t tupleIdx) {
         size_t indicesOffset = tupleIdx * k;
         size_t dstOffset = 0;
         for (size_t i = 0; i < k; i++) {
diff --git a/src/plugins/intel_cpu/src/nodes/split.cpp b/src/plugins/intel_cpu/src/nodes/split.cpp
index e8cbc83a930eb4..23ebaaf3a52b1d 100644
--- a/src/plugins/intel_cpu/src/nodes/split.cpp
+++ b/src/plugins/intel_cpu/src/nodes/split.cpp
@@ -459,6 +459,7 @@ void Split::selectOptimalPrimitiveDescriptor() {
 }
 
 void Split::optimizedNspc2Ncsp(size_t MB) {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto parentEdge = getParentEdgeAt(0);
     const int rank = parentEdge->getMemory().getShape().getRank();
     const auto parentDims = parentEdge->getMemory().getStaticDims();
@@ -490,7 +491,7 @@ void Split::optimizedNspc2Ncsp(size_t MB) {
     const size_t OC = dims[1];
     const size_t strideOB = OC * strideOC;
 
-    parallel_for2d(MB, DHW, [&](size_t b, size_t j) {
+    cpu_parallel->parallel_for2d(MB, DHW, [&](size_t b, size_t j) {
         const auto* localSrcPtr = srcPtr + b * strideIB + j * strideIW;
         auto* localDstPtr = dstData + b * strideOB + j * dataSize;
         for (size_t c = 0; c < OC; c++) {
diff --git a/src/plugins/intel_cpu/src/nodes/stft.cpp b/src/plugins/intel_cpu/src/nodes/stft.cpp
index ec6545fbf60319..7ecd710cc1fff1 100644
--- a/src/plugins/intel_cpu/src/nodes/stft.cpp
+++ b/src/plugins/intel_cpu/src/nodes/stft.cpp
@@ -116,6 +116,7 @@ void transpose_out4d(const uint8_t* in,
 }  // namespace
 
 void STFT::execute([[maybe_unused]] const dnnl::stream& strm) {
+    const auto& cpu_parallel = context->getCpuParallel();
     const auto* signal = getSrcDataAtPortAs<const float>(DATA_IDX);
     const auto* window = getSrcDataAtPortAs<const float>(WINDOW_IDX);
     auto* rdft_result = getDstDataAtPortAs<float>(0);
@@ -147,7 +148,7 @@ void STFT::execute([[maybe_unused]] const dnnl::stream& strm) {
         dst = dst_mem->getDataAs<float>();
     }
 
-    parallel_for2d(batch_size, num_frames, [&](size_t batch, size_t frame_idx) {
+    cpu_parallel->parallel_for2d(batch_size, num_frames, [&](size_t batch, size_t frame_idx) {
         size_t batch_in_start = batch * signal_length;
         size_t batch_frames_out = batch * num_frames;
diff --git a/src/plugins/intel_cpu/src/nodes/topk.cpp b/src/plugins/intel_cpu/src/nodes/topk.cpp
index 95f4bf3a4976a4..c2ad4d958998e3 100644
--- a/src/plugins/intel_cpu/src/nodes/topk.cpp
+++ b/src/plugins/intel_cpu/src/nodes/topk.cpp
@@ -30,7 +30,6 @@
 #include "onednn/iml_type_mapper.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/op/constant.hpp"
@@ -2248,6 +2247,7 @@ void TopK::execute([[maybe_unused]] const dnnl::stream& strm) {
 }
 
 void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_idx_ptr) {
+    const auto& cpu_parallel = context->getCpuParallel();
     uint8_t* process_ptr = vec_process_ptr.data();
     uint8_t* process_idx_ptr = vec_process_idx_ptr.data();
 
@@ -2256,7 +2256,7 @@ void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_id
         size_t IA = div_up(src_dims[1], blk_size);
         size_t OA = div_up(dst_dims[1], blk_size);
         if (algorithm == TopKAlgorithm::topk_bubble_sort) {
-            parallel_for2d(O, I, [&](size_t o, size_t i) {
+            cpu_parallel->parallel_for2d(O, I, [&](size_t o, size_t i) {
                 const uint8_t* in_ptr_a = in_ptr + (o * IA * I + i) * blk_size * data_size;
                 uint8_t* out_ptr_a = out_ptr + (o * OA * I + i) * blk_size * data_size;
                 uint8_t* out_idx_ptr_a = out_idx_ptr + (o * OA * I + i) * blk_size * sizeof(int32_t);
@@ -2264,7 +2264,7 @@ void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_id
                 topk_kernel_process(in_ptr_a, out_ptr_a, out_idx_ptr_a, nullptr, nullptr, work_amount);
             });
         } else if (algorithm == TopKAlgorithm::topk_bitonic_sort) {
-            parallel_for(O, [&](size_t o) {
+            cpu_parallel->parallel_for(O, [&](size_t o) {
                 const uint8_t* in_ptr_a = in_ptr + o * IA * I * blk_size * data_size;
                 uint8_t* process_ptr_a = process_ptr + o * IA * I * blk_size * data_size;
                 uint8_t* process_idx_ptr_a = process_idx_ptr + o * IA * I * blk_size * sizeof(int32_t);
@@ -2275,7 +2275,7 @@ void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_id
             });
         }
     } else {  // [planar layout] [blocked layout with topk on non-C]
-        parallel_for2d(O, I / blk_size, [&](size_t o, size_t k) {
+        cpu_parallel->parallel_for2d(O, I / blk_size, [&](size_t o, size_t k) {
             const uint8_t* in_ptr_a = in_ptr + (o * A * I + k * blk_size) * data_size;
             uint8_t* process_ptr_a = process_ptr + (o * A * I + k * blk_size) * data_size;
             uint8_t* process_idx_ptr_a = process_idx_ptr + (o * A * I + k * blk_size) * sizeof(int32_t);
@@ -2288,7 +2288,7 @@ void TopK::topk_process(const uint8_t* in_ptr, uint8_t* out_ptr, uint8_t* out_id
         size_t tail_start = I / blk_size * blk_size;
         size_t work_amount = I - tail_start;
         if (work_amount) {
-            parallel_for(O, [&](size_t o) {
+            cpu_parallel->parallel_for(O, [&](size_t o) {
                 const uint8_t* in_ptr_a = in_ptr + (o * A * I + tail_start) * data_size;
                 uint8_t* process_ptr_a = process_ptr + (o * A * I + tail_start) * data_size;
                 uint8_t* process_idx_ptr_a = process_idx_ptr + (o * A * I + tail_start) * sizeof(int32_t);
@@ -2486,9 +2486,10 @@ void TopK::topk_ref_process(const float* src_data,
                             int32_t* dst_idx,
                             const VectorDims& in_dims,
                             std::function<float(float, float)> compare) const {
+    const auto& cpu_parallel = context->getCpuParallel();
     int after_num = count(in_dims, axis + 1, in_dims.size());
 
-    parallel_for2d(before_num, after_num, [&](int i0, int i1) {
+    cpu_parallel->parallel_for2d(before_num, after_num, [&](int i0, int i1) {
        std::vector<float> max_values(top_k + 1);
        std::vector<int> max_indexes(top_k + 1);
        int s_index = i0 * dim * after_num + i1;
diff --git a/src/plugins/intel_cpu/src/nodes/unique.cpp b/src/plugins/intel_cpu/src/nodes/unique.cpp
index dfb18e7ae1f181..6eaef4cbd4e3f8 100644
--- a/src/plugins/intel_cpu/src/nodes/unique.cpp
+++ b/src/plugins/intel_cpu/src/nodes/unique.cpp
@@ -26,7 +26,6 @@
 #include "openvino/cc/selective_build.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
-#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "selective_build.h"
@@ -297,6 +296,7 @@ void Unique::flattenTensorExec() {
 
 template <typename T>
 void Unique::slicedTensorExec() {
+    const auto& cpu_parallel = context->getCpuParallel();
     auto inDataMemPtr = getSrcMemoryAtPort(IN_DATA);
     const auto* srcDataPtr = inDataMemPtr->getDataAs<const T>();
     int* firstTmpPtr = nullptr;
@@ -400,7 +400,7 @@ void Unique::slicedTensorExec() {
     const auto dstOuterStep = innerLen * uniqueLen;
     // Filling of the first output if needed.
     if (sorted || definedOutputs[UNIQUE_DATA]) {
-        parallel_for(uniqueLen, [&](size_t u) {
+        cpu_parallel->parallel_for(uniqueLen, [&](size_t u) {
            const auto* first1 = srcDataPtr + uniqIdx[u] * innerLen;
            auto first2 = dstDataPtr + u * innerLen;
            for (int64_t p = 0LU; p < outerLen; p++) {
@@ -449,7 +449,7 @@ void Unique::slicedTensorExec() {
         });
 
         // Permutation
-        parallel_for2d(outerLen, uniqueLen, [&](int64_t ot, size_t u) {
+        cpu_parallel->parallel_for2d(outerLen, uniqueLen, [&](int64_t ot, size_t u) {
             auto src = dst1 + ot * dstOuterStep + colToSort[u].idx * innerLen;
             auto dst = dst2 + ot * dstOuterStep + u * innerLen;
 
@@ -457,7 +457,7 @@ void Unique::slicedTensorExec() {
         });
 
         if (defined3outputs) {
-            parallel_for(uniqueLen, [&](size_t u) {
+            cpu_parallel->parallel_for(uniqueLen, [&](size_t u) {
                 if (definedOutputs[FIRST_UNIQUE_IDX]) {
                     first1[u] = first2[colToSort[u].idx];
                 }
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
index e867f84fef18b3..625c9fc560f894 100644
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@@ -272,27 +272,39 @@ void Plugin::get_performance_streams(Config& config,
                                      const std::shared_ptr<ov::Model>& model,
                                      bool imported) {
-    const auto model_prefer_name = std::string("MODEL_PREFER_THREADS");
+    std::vector<std::string> model_prefer_name = {std::string("MODEL_PREFER_THREADS_LATENCY"),
+                                                  std::string("MODEL_PREFER_THREADS_THROUGHPUT"),
+                                                  std::string("TBB_PARTITIONER")};
 
     if (imported && model->has_rt_info("intel_cpu_hints_config")) {
-        // load model_prefer_threads from cache
-        int cache_model_prefer = 0;
+        // load model_prefer_threads and tbbPartitioner from cache
         const auto& hints_config = model->get_rt_info<ov::AnyMap>("intel_cpu_hints_config");
-        const auto it_model_prefer = hints_config.find(model_prefer_name);
-        if (it_model_prefer != hints_config.end()) {
-            try {
-                cache_model_prefer = it_model_prefer->second.as<int>();
-            } catch (const ov::Exception&) {
-                OPENVINO_THROW("Cache file doesn't have valid value for " + model_prefer_name);
+        for (auto& one_name : model_prefer_name) {
+            auto it_model_prefer = hints_config.find(one_name);
+            if (it_model_prefer != hints_config.end()) {
+                try {
+                    if (one_name == std::string("TBB_PARTITIONER")) {
+                        conf.tbbPartitioner = it_model_prefer->second.as<ov::intel_cpu::TbbPartitioner>();
+                    } else if (one_name == std::string("MODEL_PREFER_THREADS_LATENCY")) {
+                        conf.modelPreferThreadsLatency = it_model_prefer->second.as<int>();
+                    } else {
+                        conf.modelPreferThreadsThroughput = it_model_prefer->second.as<int>();
+                    }
+                } catch (const ov::Exception&) {
+                    OPENVINO_THROW("Cache file doesn't have valid value for " + one_name);
+                }
+            }
-
-            conf.modelPreferThreads = cache_model_prefer;
         }
+        conf.modelPreferThreads = 0;
     }
 
     get_performance_streams(conf, model);
     // save model_prefer_threads to model rt_info when loading network
     if (!imported) {
         ov::AnyMap hints_props;
-        hints_props.insert({model_prefer_name, std::to_string(conf.modelPreferThreads)});
+        hints_props.insert({model_prefer_name[0], std::to_string(conf.modelPreferThreadsLatency)});
+        hints_props.insert({model_prefer_name[1], std::to_string(conf.modelPreferThreadsThroughput)});
+        std::stringstream tbb_partitioner;
+        tbb_partitioner << conf.tbbPartitioner;
+        hints_props.insert({model_prefer_name[2], tbb_partitioner.str()});
         model->set_rt_info(hints_props, "intel_cpu_hints_config");
     }
 }
@@ -544,6 +556,7 @@ ov::Any Plugin::get_ro_property(const std::string& name, [[maybe_unused]] const
             RO_property(ov::device::architecture.name()),
         };
         // the whole config is RW before model is loaded.
+
         std::vector<ov::PropertyName> rwProperties{RW_property(ov::num_streams.name()),
                                                    RW_property(ov::inference_num_threads.name()),
                                                    RW_property(ov::enable_profiling.name()),
@@ -561,6 +574,7 @@ ov::Any Plugin::get_ro_property(const std::string& name, [[maybe_unused]] const
                                                    RW_property(ov::log::level.name()),
                                                    RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
                                                    RW_property(ov::intel_cpu::enable_tensor_parallel.name()),
+                                                   RW_property(ov::intel_cpu::tbb_partitioner.name()),
                                                    RW_property(ov::hint::dynamic_quantization_group_size.name()),
                                                    RW_property(ov::hint::kv_cache_precision.name()),
                                                    RW_property(ov::key_cache_precision.name()),
@@ -638,6 +652,9 @@ ov::Any Plugin::get_ro_property(const std::string& name, [[maybe_unused]] const
     if (name == ov::intel_cpu::enable_tensor_parallel) {
         return static_cast<decltype(ov::intel_cpu::enable_tensor_parallel)::value_type>(engConfig.enableTensorParallel);
     }
+    if (name == ov::intel_cpu::tbb_partitioner) {
+        return static_cast<decltype(ov::intel_cpu::tbb_partitioner)::value_type>(engConfig.tbbPartitioner);
+    }
     if (name == ov::execution_devices) {
         return decltype(ov::execution_devices)::value_type{get_device_name()};
     }
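
The caching logic above round-trips the plugin hints through public ov::Model rt_info accessors only. A self-contained sketch of the same mechanism, reusing the key names from this patch; the one-node model is purely illustrative:

    #include <iostream>
    #include <memory>
    #include <openvino/core/model.hpp>
    #include <openvino/op/parameter.hpp>
    #include <openvino/op/result.hpp>

    int main() {
        // Tiny parameter-to-result model, just something to hang rt_info on.
        auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1});
        auto result = std::make_shared<ov::op::v0::Result>(param);
        auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{param});

        // Save: the plugin stores the values as strings under one section key.
        ov::AnyMap hints_props{{"MODEL_PREFER_THREADS_LATENCY", "4"},
                               {"MODEL_PREFER_THREADS_THROUGHPUT", "16"},
                               {"TBB_PARTITIONER", "STATIC"}};
        model->set_rt_info(hints_props, "intel_cpu_hints_config");

        // Load: ov::Any converts the cached string back to the requested type.
        if (model->has_rt_info("intel_cpu_hints_config")) {
            const auto& cfg = model->get_rt_info<ov::AnyMap>("intel_cpu_hints_config");
            std::cout << cfg.at("MODEL_PREFER_THREADS_LATENCY").as<int>() << "\n";
        }
        return 0;
    }
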
diff --git a/src/plugins/intel_cpu/src/thread_pool_imp.cpp b/src/plugins/intel_cpu/src/thread_pool_imp.cpp
new file mode 100644
index 00000000000000..c612deff37f5e0
--- /dev/null
+++ b/src/plugins/intel_cpu/src/thread_pool_imp.cpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "thread_pool_imp.hpp"
+
+#include <memory>
+#include <oneapi/dnnl/dnnl.hpp>
+#include <oneapi/dnnl/dnnl_threadpool.hpp>
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+#    include <oneapi/tbb.h>
+
+#    include "cpu_parallel.hpp"
+#    include "openvino/core/parallel.hpp"
+#    include "openvino/runtime/intel_cpu/properties.hpp"
+#endif
+
+namespace ov::intel_cpu {
+
+dnnl::stream make_stream(const dnnl::engine& engine, const std::shared_ptr<ThreadPool>& thread_pool) {  // NOLINT
+#if OV_THREAD == OV_THREAD_TBB_ADAPTIVE
+    static auto g_cpu_parallel = std::make_shared<CpuParallel>(ov::intel_cpu::TbbPartitioner::STATIC);
+    auto stream = dnnl::threadpool_interop::make_stream(
+        engine,
+        thread_pool ? thread_pool.get() : g_cpu_parallel->get_thread_pool().get());
+#else
+    auto stream = dnnl::stream(engine);
+#endif
+    return stream;
+}
+
+}  // namespace ov::intel_cpu
diff --git a/src/plugins/intel_cpu/src/thread_pool_imp.hpp b/src/plugins/intel_cpu/src/thread_pool_imp.hpp
new file mode 100644
index 00000000000000..686d6bb91312fb
--- /dev/null
+++ b/src/plugins/intel_cpu/src/thread_pool_imp.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <oneapi/dnnl/dnnl_threadpool_iface.hpp>
+
+#include "cpu_parallel.hpp"
+#include "openvino/core/parallel.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
+
+namespace ov::intel_cpu {
+
+class ThreadPool : public dnnl::threadpool_interop::threadpool_iface {
+public:
+    ThreadPool() = delete;
+    ThreadPool(ThreadPool&) = delete;
+    ThreadPool& operator=(ThreadPool&) = delete;
+    ThreadPool(ThreadPool&&) = delete;
+    ThreadPool& operator=(ThreadPool&&) = delete;
+
+    explicit ThreadPool(const CpuParallel& cpu_parallel) : m_cpu_parallel(cpu_parallel) {}
+
+    [[nodiscard]] int get_num_threads() const override {
+        return m_cpu_parallel.get_num_threads();
+    }
+    [[nodiscard]] bool get_in_parallel() const override {
+        return false;
+    }
+    [[nodiscard]] uint64_t get_flags() const override {
+        return 0;
+    }
+    void parallel_for(int n, const std::function<void(int, int)>& fn) override {
+        m_cpu_parallel.parallel_simple(n, fn);
+    }
+
+private:
+    const CpuParallel& m_cpu_parallel;
+};
+
+dnnl::stream make_stream(const dnnl::engine& engine, const std::shared_ptr<ThreadPool>& thread_pool = nullptr);
+
+}  // namespace ov::intel_cpu
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/repack_matmul_weights.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/repack_matmul_weights.cpp
index b1ad52580d4789..e766ff254aa132 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/repack_matmul_weights.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/repack_matmul_weights.cpp
@@ -104,7 +104,8 @@ bool RepackMatMulWeights::run_on_model(const std::shared_ptr<ov::Model>& model)
                                   eng,
                                   m_context->getParamsCache(),
                                   m_context->getWeightsCache(),
-                                  nullptr);
+                                  nullptr,
+                                  m_context->getCpuParallel()->get_thread_pool());
             weights_idxs.insert(i);
         }
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
index b78ef0204bb19a..afb0c4728c1577 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -48,6 +48,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable
         RO_property(ov::log::level.name()),
         RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
         RO_property(ov::intel_cpu::enable_tensor_parallel.name()),
+        RO_property(ov::intel_cpu::tbb_partitioner.name()),
         RO_property(ov::hint::dynamic_quantization_group_size.name()),
         RO_property(ov::hint::kv_cache_precision.name()),
         RO_property(ov::key_cache_precision.name()),
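
ThreadPool is a thin adapter: a oneDNN build with DNNL_CPU_RUNTIME=THREADPOOL calls back through whatever dnnl::threadpool_interop::threadpool_iface is attached to the stream, and here that callback lands in CpuParallel::parallel_simple. A self-contained sketch of the same interface backed by std::thread instead of CpuParallel (assumes a threadpool-enabled oneDNN build; the chunking is what any such adapter must provide):

    #include <oneapi/dnnl/dnnl.hpp>
    #include <oneapi/dnnl/dnnl_threadpool.hpp>
    #include <oneapi/dnnl/dnnl_threadpool_iface.hpp>

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <thread>
    #include <vector>

    // Minimal threadpool_iface implementation; CpuParallel plays this role in
    // the patch. oneDNN requires fn(i, n) to run once for every i in [0, n).
    class NaiveThreadPool : public dnnl::threadpool_interop::threadpool_iface {
    public:
        int get_num_threads() const override {
            return static_cast<int>(std::max(1U, std::thread::hardware_concurrency()));
        }
        bool get_in_parallel() const override {
            return false;
        }
        uint64_t get_flags() const override {
            return 0;
        }
        void parallel_for(int n, const std::function<void(int, int)>& fn) override {
            const int nthr = std::min(n, get_num_threads());
            std::vector<std::thread> workers;
            workers.reserve(nthr);
            for (int t = 0; t < nthr; t++) {
                workers.emplace_back([&, t]() {
                    for (int i = t; i < n; i += nthr) {
                        fn(i, n);  // strided share of the n closures
                    }
                });
            }
            for (auto& w : workers) {
                w.join();
            }
        }
    };

    int main() {
        dnnl::engine eng(dnnl::engine::kind::cpu, 0);
        NaiveThreadPool pool;
        // Equivalent of make_stream() in thread_pool_imp.cpp when a pool is given.
        dnnl::stream strm = dnnl::threadpool_interop::make_stream(eng, &pool);
        (void)strm;  // primitives executed on strm now fan out via the pool
        return 0;
    }
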
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
index bb786f3828becc..39a30f394074ca 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
@@ -61,6 +61,7 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) {
         RW_property(ov::log::level.name()),
         RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
         RW_property(ov::intel_cpu::enable_tensor_parallel.name()),
+        RW_property(ov::intel_cpu::tbb_partitioner.name()),
         RW_property(ov::hint::dynamic_quantization_group_size.name()),
         RW_property(ov::hint::kv_cache_precision.name()),
         RW_property(ov::key_cache_precision.name()),
diff --git a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp
index 7864dd2426c8cf..3d927e71afce8b 100644
--- a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp
+++ b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp
@@ -16,6 +16,7 @@
 #include "common_test_utils/common_utils.hpp"
 #include "memory_control.hpp"
 #include "nodes/input.h"
+#include "thread_pool_imp.hpp"
 
 using namespace ov::intel_cpu;
 namespace ReorderCPUTest {
@@ -139,7 +140,7 @@ class ReorderCPUTestGraph {
             n->initSupportedPrimitiveDescriptors();
             n->selectPrimitiveDescriptorByIndex(0);
         }
-        stream = dnnl::stream{cpuEngine};
+        stream = ov::intel_cpu::make_stream(cpuEngine, context->getCpuParallel()->get_thread_pool());
     }
 
 protected:
diff --git a/src/plugins/intel_cpu/thirdparty/CMakeLists.txt b/src/plugins/intel_cpu/thirdparty/CMakeLists.txt
index d32a2b0088d502..24bf14639919b2 100644
--- a/src/plugins/intel_cpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_cpu/thirdparty/CMakeLists.txt
@@ -49,7 +49,11 @@ function(ov_add_onednn)
     set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "" FORCE)
     set(DNNL_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
     set(DNNL_BUILD_TESTS OFF CACHE BOOL "" FORCE)
-    set(DNNL_CPU_RUNTIME "${THREADING}" CACHE STRING "" FORCE)
+    if("${THREADING}" STREQUAL "TBB_ADAPTIVE")
+        set(DNNL_CPU_RUNTIME "THREADPOOL" CACHE STRING "" FORCE)
+    else()
+        set(DNNL_CPU_RUNTIME "${THREADING}" CACHE STRING "" FORCE)
+    endif()
     set(DNNL_GPU_RUNTIME "NONE" CACHE STRING "" FORCE)
     set(DNNL_BLAS_VENDOR "NONE" CACHE STRING "" FORCE)
     set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "SSE41" CACHE STRING "" FORCE)
diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index 0cad963300cd2b..6e4715d2d6f635 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit 0cad963300cd2b80c371cb66d435c60ad0e5edd7
+Subproject commit 6e4715d2d6f635991eeb67e1ad584e94542c9f23
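
End to end: building with THREADING=TBB_ADAPTIVE selects oneDNN's THREADPOOL runtime (per the CMake change above), and the partitioner choice surfaces as a regular CPU plugin property. A hedged usage sketch; it assumes ov::intel_cpu::tbb_partitioner follows the usual ov::Property pattern implied by the plugin.cpp registration, and the model path is a placeholder:

    #include <iostream>
    #include <openvino/openvino.hpp>
    #include <openvino/runtime/intel_cpu/properties.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");  // placeholder path

        // TbbPartitioner::STATIC is the value the patch itself uses for the
        // fallback pool in thread_pool_imp.cpp.
        auto compiled =
            core.compile_model(model,
                               "CPU",
                               ov::intel_cpu::tbb_partitioner(ov::intel_cpu::TbbPartitioner::STATIC));

        std::cout << compiled.get_property(ov::intel_cpu::tbb_partitioner.name()).as<std::string>() << "\n";
        return 0;
    }
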