Commit bbed74e

[GPU] Optimize Eltwise (#32384)
+ Fusing Eltwise to Broadcast
+ Implemented fusing operation to broadcast_ref kernel

### Description of the issue

- Profiling of Qwen-Reranker (CVS-173218) showed Eltwise taking 6% of execution time as a reference kernel; it can be optimized away.

#### The code and lines that caused this issue

- Modified broadcast ref kernel: kernel_selector/cl_kernels/broadcast_gpu_ref.cl
- Added a condition for fusing Eltwise into Broadcast: graph_optimizer/prepare_primitive_fusing.cpp
- Added logic for fused post-ops: broadcast/broadcast_kernel_base.cpp

#### Reproduction steps and snapshot

- The target model, Qwen3-Reranker-0.6B, is in openvino_notebooks (notebooks/qwen3-embedding)
- Reproduce with benchmark_app:
  `./benchmark_app -m openvino_notebooks/notebooks/qwen3-embedding/Qwen3-Reranker-0.6B/FP16/openvino_model.xml -shape [64,256] -d GPU -hint latency -api sync -nireq 1 -niter 1`

#### Checklist

- [x] Is it a proper fix?
- [x] Did you include a test case for this fix, if necessary?
- [x] Did you review existing tests that can be extended to cover this scenario?

### Tickets:

- CVS-173226

---------

Signed-off-by: Min, Byungil <[email protected]>
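To make the target pattern concrete, here is a minimal sketch of the subgraph this commit fuses, written against the cldnn C++ API as used by the new fusion test at the bottom of this diff (the function name, shapes, and empty axis set are illustrative, not part of the commit):

```cpp
#include <intel_gpu/graph/topology.hpp>
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/broadcast.hpp>
#include <intel_gpu/primitives/eltwise.hpp>

using namespace cldnn;

// Broadcast whose single user is an Eltwise with a matching data type:
// after this commit, prepare_primitive_fusing folds the Eltwise into the
// broadcast_gpu_ref kernel as a fused post-op instead of running it as a
// standalone reference kernel.
topology make_broadcast_eltwise_pattern(const layout& in1, const layout& in2) {
    topology t;
    t.add(input_layout("input", in1));
    t.add(input_layout("input2", in2));
    t.add(broadcast("broadcast", input_info("input"), in2.get_shape(),
                    ov::AxisSet{}, ov::op::BroadcastType::NUMPY));
    t.add(eltwise("eltwise", { input_info("broadcast"), input_info("input2") },
                  eltwise_mode::sum));
    return t;
}
```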
1 parent: 05576a1

5 files changed: +184 −13

src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp

Lines changed: 21 additions & 1 deletion
@@ -43,6 +43,7 @@
 #include "reduce_inst.h"
 #include "group_normalization_inst.h"
 #include "lora_inst.h"
+#include "broadcast_inst.h"
 #include <vector>
 #include <map>
 #include <list>
@@ -751,6 +752,23 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
         return lora_is_single_user && is_simple_lora;
     };

+    auto broadcast_supports_fusings = [&](broadcast_node& bcast_node) -> bool {
+        if (bcast_node.get_outputs_count() != 1)
+            return false;
+
+        bool out_eltw = bcast_node.get_users().front()->is_type<eltwise>();
+        if (!out_eltw)
+            return false;
+
+        auto input_layout = bcast_node.get_output_layout();
+        auto output_layout = bcast_node.get_users().front()->get_output_layout();
+        if (input_layout.data_type != output_layout.data_type) {
+            return false;
+        }
+
+        return true;
+    };
+
     auto fuse_activation_f = [&](activation_node& activation_node) {
         GPU_DEBUG_IF(p.get_config().get_disable_post_ops_fusions() != 0) {
@@ -1055,7 +1073,9 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
                     reduce_supports_fusings(parents[i].first->as<reduce>())) ||
                    (parents[i].first->is_type<lrn>()) ||
                    (parents[i].first->is_type<lora>() &&
-                    lora_supports_fusings(parents[i].first->as<lora>()));
+                    lora_supports_fusings(parents[i].first->as<lora>())) ||
+                   (parents[i].first->is_type<broadcast>() &&
+                    broadcast_supports_fusings(parents[i].first->as<broadcast>()));
         }

         // Disable fusion to a node on constant path when second input is in data flow
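In short: the new `broadcast_supports_fusings` gate admits a Broadcast only when it has a single output, its first user is an Eltwise, and the Broadcast output and the Eltwise output share a data type. The fusibility condition in `fuse_simple_primitives` then treats such a Broadcast like the other fusable parents (reduce, lrn, lora).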

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/broadcast_gpu_ref.cl

Lines changed: 46 additions & 10 deletions
@@ -258,7 +258,11 @@ inline uint FUNC(get_idx_pos)(OPTIONAL_SHAPE_INFO_ARG uint out_b, uint out_f, ui
 KERNEL(broadcast_gpu_ref)(
     OPTIONAL_SHAPE_INFO_ARG
     const __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output)
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
 #if SAME_RANK_PLAIN_FORMAT == 1
     const bool use_opt_code = INPUT0_SIZE_X == OUTPUT_SIZE_X && INPUT0_SIZE_Y != OUTPUT_SIZE_Y
@@ -319,11 +323,20 @@ KERNEL(broadcast_gpu_ref)(
     if (remained_y < y_stride)
         y_nums += remained_y;

+#if HAS_FUSED_OPS
+    OUTPUT_TYPE res = 0;
+#endif
     if (OUTPUT_SIZE_X < VEC_SIZE) {
         uint output_idx = out_pos;
-        unroll_for(uint j = 0; j < y_nums; j++) {
-            unroll_for(uint i = 0; i < x_stride; i++) {
-                output[output_idx + i] = TO_OUTPUT_TYPE(input[idx_pos + i]);
+        unroll_for(uint i = 0; i < y_nums; i++) {
+            unroll_for(uint offset = 0; offset < x_stride; offset++) {
+#if HAS_FUSED_OPS
+                res = TO_OUTPUT_TYPE(input[idx_pos + offset]);
+                FUSED_OPS
+                output[output_idx + offset] = FUSED_OPS_RESULT;
+#else
+                output[output_idx + offset] = TO_OUTPUT_TYPE(input[idx_pos + offset]);
+#endif
             }
             output_idx += OUTPUT_SIZE_X;
         }
@@ -332,19 +345,35 @@ KERNEL(broadcast_gpu_ref)(
         INPUT0_VTYPE input_vec = VLOAD(0, &input[idx_pos]);
         unroll_for(uint i = 0; i < y_nums; i++) {
             OUTPUT_VTYPE out_v;
-            for (int j = 0; j < VEC_SIZE; ++j)
-                out_v[j] = TO_OUTPUT_TYPE(input_vec[j]);
+            for (int offset = 0; offset < VEC_SIZE; ++offset) {
+#if HAS_FUSED_OPS
+                res = TO_OUTPUT_TYPE(input_vec[offset]);
+                FUSED_OPS
+                out_v[offset] = FUSED_OPS_RESULT;
+#else
+                out_v[offset] = TO_OUTPUT_TYPE(input_vec[offset]);
+#endif
+            }
             VSTORE(out_v, 0, &output[output_idx]);
             output_idx += OUTPUT_SIZE_X;
         }

         if (gdim0 < x_leftovers) {
             INPUT0_TYPE input_val = input[idx_pos + x_stride];
+#if HAS_FUSED_OPS
+            res = TO_OUTPUT_TYPE(input_val);
+#endif

-            output_idx = out_pos;
+            uint offset = x_stride;
             unroll_for(uint i = 0; i < y_nums; i++) {
-                output[output_idx + x_stride] = TO_OUTPUT_TYPE(input_val);
-                output_idx += OUTPUT_SIZE_X;
+#if HAS_FUSED_OPS
+                FUSED_OPS
+                output[out_pos + offset] = FUSED_OPS_RESULT;
+                offset += OUTPUT_SIZE_X;
+#else
+                output[out_pos + offset] = TO_OUTPUT_TYPE(input_val);
+                offset += OUTPUT_SIZE_X;
+#endif
             }
         }
     }
@@ -379,7 +408,14 @@ KERNEL(broadcast_gpu_ref)(
     const uint out_pos = OUTPUT_GET_INDEX(out_b, out_f, out_y, out_x);
     const uint idx_pos = FUNC_CALL(get_idx_pos)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_y, out_x);
 #endif
-    output[out_pos] = TO_OUTPUT_TYPE(input[idx_pos]);
+#if HAS_FUSED_OPS
+    uint offset = 0;
+    OUTPUT_TYPE res = TO_OUTPUT_TYPE(input[idx_pos]);
+    FUSED_OPS
+    output[out_pos] = FUSED_OPS_RESULT;
+#else
+    output[out_pos] = TO_OUTPUT_TYPE(input[idx_pos]);
+#endif
     }
 }
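Every branch of the kernel now follows the same pattern: load the broadcast value into `res`, run the generated `FUSED_OPS` block (which indexes the fused Eltwise operand through the host-supplied `idx_order`, hence the `out_x + offset` term), and store `FUSED_OPS_RESULT` instead of the raw value. A minimal CPU model of the fused path, assuming an eltwise(sum) post-op (plain C++, purely illustrative):

```cpp
#include <cstddef>
#include <vector>

// Minimal CPU model of the fused path: broadcast one [X]-wide row across
// Y output rows and apply the fused eltwise(sum) operand in the same pass,
// instead of writing the broadcast result and running a separate Eltwise
// kernel over it afterwards.
std::vector<float> broadcast_then_add(const std::vector<float>& row,   // [X]
                                      const std::vector<float>& other, // [Y * X]
                                      std::size_t Y, std::size_t X) {
    std::vector<float> out(Y * X);
    for (std::size_t y = 0; y < Y; ++y) {
        for (std::size_t x = 0; x < X; ++x) {
            float res = row[x];       // broadcast load ("res" in the kernel)
            res += other[y * X + x];  // what FUSED_OPS does for an eltwise sum
            out[y * X + x] = res;     // store FUSED_OPS_RESULT
        }
    }
    return out;
}
```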

src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_base.cpp

Lines changed: 19 additions & 1 deletion
@@ -32,6 +32,24 @@ JitConstants BroadcastKernelBase::GetJitConstants(const broadcast_params& params
     jit.AddConstants({MakeJitConstant("VEC_SIZE", VEC_SIZE)});
     jit.AddConstants({MakeJitConstant("Y_BLOCKS", Y_BLOCKS)});
     jit.AddConstants({MakeJitConstant("SAME_RANK_PLAIN_FORMAT", is_same_planar_format(in_layout, out_layout))});
+
+    // Fused post_ops
+    if (!params.fused_ops.empty()) {
+        kernel_selector::Datatype input_dt = params.outputs[0].GetDType();
+        std::vector<std::string> idx_order;
+        if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 4) {
+            idx_order = {"out_b", "out_f", "out_y", "out_x + offset"};
+        } else if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 5) {
+            idx_order = {"out_b", "out_f", "out_z", "out_y", "out_x + offset"};
+        } else if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 6) {
+            idx_order = {"out_b", "out_f", "out_w", "out_z", "out_y", "out_x + offset"};
+        }
+
+        FusedOpsConfiguration conf = {"", idx_order, "res", input_dt, 1};
+
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
+
     return jit;
 }

@@ -148,7 +166,7 @@ KernelsData BroadcastKernelBase::GetCommonKernelsData(const Params& params) cons
                      false,
                      false,
                      1,
-                     0,
+                     GetFusedPrimitiveInputsCount(params),
                      1,
                      prim_params.is_shape_agnostic);
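As I read the `FusedOpsConfiguration {"", idx_order, "res", input_dt, 1}`: the empty suffix keeps the plain `FUSED_OPS`/`FUSED_OPS_RESULT` macro names, `idx_order` supplies the coordinate expressions the generated code uses to index the fused operand (which is why the kernel keeps an `offset` variable in scope at every fused store), `"res"` names the kernel-side variable holding the broadcast value, `input_dt` is its type, and the trailing `1` is the vector width, matching the kernel's per-element loops. Passing `GetFusedPrimitiveInputsCount(params)` instead of `0` reserves kernel arguments for the fused primitive's extra inputs.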

src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.h

Lines changed: 3 additions & 1 deletion
@@ -15,7 +15,9 @@ class BroadcastKernelRef : public BroadcastKernelBase {
     KernelsPriority GetKernelsPriority(const Params& params) const override;
     ParamsKey GetSupportedKey() const override;
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::REORDER };
+        return { FusedOpType::ELTWISE,
+                 FusedOpType::REORDER
+        };
     }
 };
 } // namespace kernel_selector
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+#include "fusion_test_common.hpp"
+
+#include <intel_gpu/primitives/input_layout.hpp>
+#include <intel_gpu/primitives/broadcast.hpp>
+#include <intel_gpu/primitives/eltwise.hpp>
+
+#include <cmath>
+
+using namespace cldnn;
+using namespace ::tests;
+
+namespace {
+struct broadcast_test_params {
+    ov::PartialShape input_size1;   // input for broadcast
+    ov::PartialShape input_size2;   // other input connected to output of broadcast
+    data_types input_type1;         // input data-type of 'input_size1'
+    data_types input_type2;
+    format input_format;
+    data_types default_type;
+    format default_format;
+    ov::AxisSet broadcast_axes;
+    size_t expected_fused_primitives;
+    size_t expected_not_fused_primitives;
+};
+
+class BroadcastFusingTest : public ::BaseFusingTest<broadcast_test_params> {
+public:
+    void execute(broadcast_test_params& p, bool count_reorder = false) {
+        auto input_prim = get_mem(get_input_layout1(p));
+        auto input_prim2 = get_mem(get_input_layout2(p));
+
+        network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
+        network network_fused(this->engine, this->topology_fused, cfg_fused);
+
+        auto inputs = network_fused.get_input_ids();
+        if (std::find(inputs.begin(), inputs.end(), "input") != inputs.end()) {
+            network_fused.set_input_data("input", input_prim);
+            network_not_fused.set_input_data("input", input_prim);
+        }
+        if (std::find(inputs.begin(), inputs.end(), "input2") != inputs.end()) {
+            network_fused.set_input_data("input2", input_prim2);
+            network_not_fused.set_input_data("input2", input_prim2);
+        }
+
+        compare(network_not_fused, network_fused, p, count_reorder);
+    }
+
+    layout get_input_layout1(broadcast_test_params& p) {
+        return layout{ p.input_size1, p.input_type1, p.input_format };
+    }
+
+    layout get_input_layout2(broadcast_test_params& p) {
+        return layout{ p.input_size2, p.input_type2, p.input_format };
+    }
+};
+} // namespace
+
+#define CASE_BROADCAST_FP16_1 { 1, 16, 4, 4 }, { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx, {0}
+#define CASE_BROADCAST_FP16_2 { 2, 1, 4, 4, 4 }, { 2, 16, 4, 4, 4 }, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx, {1}
+#define CASE_BROADCAST_FP16_3 { 2, 16, 4, 4, 1 }, { 2, 16, 4, 4, 8 }, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx, {4}
+
+#define CASE_BROADCAST_FP16_1_BLK { 2, 16, 4, 1 }, { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx, {3}
+#define CASE_BROADCAST_FP16_2_BLK { 1, 16, 4, 4 }, { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx, {0}
+#define CASE_BROADCAST_FP16_3_BLK { 2, 16, 4, 1 }, { 2, 16, 4, 4 }, data_types::u8, data_types::i8, format::b_fs_yx_fsv32, data_types::f16, format::bfyx, {3}
+
+class broadcast_fused_prims : public BroadcastFusingTest {};
+TEST_P(broadcast_fused_prims, broadcast_activation_with_broadcast) {
+    auto p = GetParam();
+    create_topologies(
+        input_layout("input", get_input_layout1(p)),
+        input_layout("input2", get_input_layout2(p)),
+        broadcast("broadcast", input_info("input"), get_input_layout2(p).get_shape(), ov::AxisSet(p.broadcast_axes), ov::op::BroadcastType::NUMPY),
+        eltwise("eltwise", { input_info("broadcast"), input_info("input2") }, eltwise_mode::sum, p.default_type),
+        activation("activation", input_info("eltwise"), activation_func::abs),
+        reorder("out", input_info("activation"), p.default_format, data_types::f32)
+    );
+
+    tolerance = default_tolerance(p.input_type1);
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, broadcast_fused_prims, ::testing::ValuesIn(std::vector<broadcast_test_params>{
+    broadcast_test_params{ CASE_BROADCAST_FP16_1, 4, 5 },
+    broadcast_test_params{ CASE_BROADCAST_FP16_2, 4, 5 },
+    broadcast_test_params{ CASE_BROADCAST_FP16_3, 4, 5 },
+
+    broadcast_test_params{ CASE_BROADCAST_FP16_1_BLK, 4, 5 },
+    broadcast_test_params{ CASE_BROADCAST_FP16_2_BLK, 4, 5 },
+    broadcast_test_params{ CASE_BROADCAST_FP16_3_BLK, 4, 5 },
+}));
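A note on the `{ ..., 4, 5 }` pairs: with reorders excluded from the count (`count_reorder` defaults to false), the unfused graph should execute five primitives (two inputs, broadcast, eltwise, activation) and the fused graph four, since the eltwise is absorbed into the broadcast; `BaseFusingTest::compare` presumably checks exactly these executed-primitive counts.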
