Commit bbed74e

[GPU] Optimize Eltwise (#32384)
+ Fusing Eltwise to Broadcast
+ Implemented fusing operation to broadcast_ref kernel

### Description of the issue

- Profiling of Qwen-Reranker (CVS-173218) showed Eltwise taking 6% of execution time as a reference kernel; it can be optimized away.

#### The code and lines that caused this issue

- Modified broadcast ref kernel: kernel_selector/cl_kernels/broadcast_gpu_ref.cl
- Added a condition for fusing Eltwise into Broadcast: graph_optimizer/prepare_primitive_fusing.cpp
- Added logic for fused post-ops: broadcast/broadcast_kernel_base.cpp

#### Reproduction steps and snapshot

- The target model, Qwen3-Reranker-0.6B, is in openvino_notebooks (notebooks/qwen3-embedding)
- Reproduce with benchmark_app:
  `./benchmark_app -m openvino_notebooks/notebooks/qwen3-embedding/Qwen3-Reranker-0.6B/FP16/openvino_model.xml -shape [64,256] -d GPU -hint latency -api sync -nireq 1 -niter 1`

#### Checklist

- [x] Is it a proper fix?
- [x] Did you include a test case for this fix, if necessary?
- [x] Did you review existing tests that can be extended to cover this scenario?

### Tickets:

- CVS-173226

---------

Signed-off-by: Min, Byungil <[email protected]>
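To make the target pattern concrete, here is a minimal sketch of the subgraph this commit fuses, written against the cldnn C++ API as used by the new fusion test at the bottom of this diff (the function name, shapes, and empty axis set are illustrative, not part of the commit):

```cpp
#include <intel_gpu/graph/topology.hpp>
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/broadcast.hpp>
#include <intel_gpu/primitives/eltwise.hpp>

using namespace cldnn;

// Broadcast whose single user is an Eltwise with a matching data type:
// after this commit, prepare_primitive_fusing folds the Eltwise into the
// broadcast_gpu_ref kernel as a fused post-op instead of running it as a
// standalone reference kernel.
topology make_broadcast_eltwise_pattern(const layout& in1, const layout& in2) {
    topology t;
    t.add(input_layout("input", in1));
    t.add(input_layout("input2", in2));
    t.add(broadcast("broadcast", input_info("input"), in2.get_shape(),
                    ov::AxisSet{}, ov::op::BroadcastType::NUMPY));
    t.add(eltwise("eltwise", { input_info("broadcast"), input_info("input2") },
                  eltwise_mode::sum));
    return t;
}
```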
1 parent: 05576a1

5 files changed: +184 −13

src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp

Lines changed: 21 additions & 1 deletion
@@ -43,6 +43,7 @@
 #include "reduce_inst.h"
 #include "group_normalization_inst.h"
 #include "lora_inst.h"
+#include "broadcast_inst.h"
 #include <vector>
 #include <map>
 #include <list>
@@ -751,6 +752,23 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
         return lora_is_single_user && is_simple_lora;
     };

+    auto broadcast_supports_fusings = [&](broadcast_node& bcast_node) -> bool {
+        if (bcast_node.get_outputs_count() != 1)
+            return false;
+
+        bool out_eltw = bcast_node.get_users().front()->is_type<eltwise>();
+        if (!out_eltw)
+            return false;
+
+        auto input_layout = bcast_node.get_output_layout();
+        auto output_layout = bcast_node.get_users().front()->get_output_layout();
+        if (input_layout.data_type != output_layout.data_type) {
+            return false;
+        }
+
+        return true;
+    };
+
     auto fuse_activation_f = [&](activation_node& activation_node) {
         GPU_DEBUG_IF(p.get_config().get_disable_post_ops_fusions() != 0) {
@@ -1055,7 +1073,9 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
                     reduce_supports_fusings(parents[i].first->as<reduce>())) ||
                    (parents[i].first->is_type<lrn>()) ||
                    (parents[i].first->is_type<lora>() &&
-                    lora_supports_fusings(parents[i].first->as<lora>()));
+                    lora_supports_fusings(parents[i].first->as<lora>())) ||
+                   (parents[i].first->is_type<broadcast>() &&
+                    broadcast_supports_fusings(parents[i].first->as<broadcast>()));
         }

         // Disable fusion to a node on constant path when second input is in data flow
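In short: the new `broadcast_supports_fusings` gate admits a Broadcast only when it has a single output, its first user is an Eltwise, and the Broadcast output and the Eltwise output share a data type. The fusibility condition in `fuse_simple_primitives` then treats such a Broadcast like the other fusable parents (reduce, lrn, lora).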

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/broadcast_gpu_ref.cl

Lines changed: 46 additions & 10 deletions
@@ -258,7 +258,11 @@ inline uint FUNC(get_idx_pos)(OPTIONAL_SHAPE_INFO_ARG uint out_b, uint out_f, ui
 KERNEL(broadcast_gpu_ref)(
     OPTIONAL_SHAPE_INFO_ARG
     const __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output)
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
 #if SAME_RANK_PLAIN_FORMAT == 1
     const bool use_opt_code = INPUT0_SIZE_X == OUTPUT_SIZE_X && INPUT0_SIZE_Y != OUTPUT_SIZE_Y
@@ -319,11 +323,20 @@ KERNEL(broadcast_gpu_ref)(
     if (remained_y < y_stride)
         y_nums += remained_y;

+#if HAS_FUSED_OPS
+    OUTPUT_TYPE res = 0;
+#endif
     if (OUTPUT_SIZE_X < VEC_SIZE) {
         uint output_idx = out_pos;
-        unroll_for(uint j = 0; j < y_nums; j++) {
-            unroll_for(uint i = 0; i < x_stride; i++) {
-                output[output_idx + i] = TO_OUTPUT_TYPE(input[idx_pos + i]);
+        unroll_for(uint i = 0; i < y_nums; i++) {
+            unroll_for(uint offset = 0; offset < x_stride; offset++) {
+#if HAS_FUSED_OPS
+                res = TO_OUTPUT_TYPE(input[idx_pos + offset]);
+                FUSED_OPS
+                output[output_idx + offset] = FUSED_OPS_RESULT;
+#else
+                output[output_idx + offset] = TO_OUTPUT_TYPE(input[idx_pos + offset]);
+#endif
             }
             output_idx += OUTPUT_SIZE_X;
         }
@@ -332,19 +345,35 @@ KERNEL(broadcast_gpu_ref)(
         INPUT0_VTYPE input_vec = VLOAD(0, &input[idx_pos]);
         unroll_for(uint i = 0; i < y_nums; i++) {
             OUTPUT_VTYPE out_v;
-            for (int j = 0; j < VEC_SIZE; ++j)
-                out_v[j] = TO_OUTPUT_TYPE(input_vec[j]);
+            for (int offset = 0; offset < VEC_SIZE; ++offset) {
+#if HAS_FUSED_OPS
+                res = TO_OUTPUT_TYPE(input_vec[offset]);
+                FUSED_OPS
+                out_v[offset] = FUSED_OPS_RESULT;
+#else
+                out_v[offset] = TO_OUTPUT_TYPE(input_vec[offset]);
+#endif
+            }
             VSTORE(out_v, 0, &output[output_idx]);
             output_idx += OUTPUT_SIZE_X;
         }

         if (gdim0 < x_leftovers) {
             INPUT0_TYPE input_val = input[idx_pos + x_stride];
+#if HAS_FUSED_OPS
+            res = TO_OUTPUT_TYPE(input_val);
+#endif

-            output_idx = out_pos;
+            uint offset = x_stride;
             unroll_for(uint i = 0; i < y_nums; i++) {
-                output[output_idx + x_stride] = TO_OUTPUT_TYPE(input_val);
-                output_idx += OUTPUT_SIZE_X;
+#if HAS_FUSED_OPS
+                FUSED_OPS
+                output[out_pos + offset] = FUSED_OPS_RESULT;
+                offset += OUTPUT_SIZE_X;
+#else
+                output[out_pos + offset] = TO_OUTPUT_TYPE(input_val);
+                offset += OUTPUT_SIZE_X;
+#endif
             }
         }
     }
@@ -379,7 +408,14 @@ KERNEL(broadcast_gpu_ref)(
     const uint out_pos = OUTPUT_GET_INDEX(out_b, out_f, out_y, out_x);
     const uint idx_pos = FUNC_CALL(get_idx_pos)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_y, out_x);
 #endif
-    output[out_pos] = TO_OUTPUT_TYPE(input[idx_pos]);
+#if HAS_FUSED_OPS
+    uint offset = 0;
+    OUTPUT_TYPE res = TO_OUTPUT_TYPE(input[idx_pos]);
+    FUSED_OPS
+    output[out_pos] = FUSED_OPS_RESULT;
+#else
+    output[out_pos] = TO_OUTPUT_TYPE(input[idx_pos]);
+#endif
     }
 }
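Every branch of the kernel now follows the same pattern: load the broadcast value into `res`, run the generated `FUSED_OPS` block (which indexes the fused Eltwise operand through the host-supplied `idx_order`, hence the `out_x + offset` term), and store `FUSED_OPS_RESULT` instead of the raw value. A minimal CPU model of the fused path, assuming an eltwise(sum) post-op (plain C++, purely illustrative):

```cpp
#include <cstddef>
#include <vector>

// Minimal CPU model of the fused path: broadcast one [X]-wide row across
// Y output rows and apply the fused eltwise(sum) operand in the same pass,
// instead of writing the broadcast result and running a separate Eltwise
// kernel over it afterwards.
std::vector<float> broadcast_then_add(const std::vector<float>& row,   // [X]
                                      const std::vector<float>& other, // [Y * X]
                                      std::size_t Y, std::size_t X) {
    std::vector<float> out(Y * X);
    for (std::size_t y = 0; y < Y; ++y) {
        for (std::size_t x = 0; x < X; ++x) {
            float res = row[x];       // broadcast load ("res" in the kernel)
            res += other[y * X + x];  // what FUSED_OPS does for an eltwise sum
            out[y * X + x] = res;     // store FUSED_OPS_RESULT
        }
    }
    return out;
}
```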

src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_base.cpp

Lines changed: 19 additions & 1 deletion
@@ -32,6 +32,24 @@ JitConstants BroadcastKernelBase::GetJitConstants(const broadcast_params& params
     jit.AddConstants({MakeJitConstant("VEC_SIZE", VEC_SIZE)});
     jit.AddConstants({MakeJitConstant("Y_BLOCKS", Y_BLOCKS)});
     jit.AddConstants({MakeJitConstant("SAME_RANK_PLAIN_FORMAT", is_same_planar_format(in_layout, out_layout))});
+
+    // Fused post_ops
+    if (!params.fused_ops.empty()) {
+        kernel_selector::Datatype input_dt = params.outputs[0].GetDType();
+        std::vector<std::string> idx_order;
+        if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 4) {
+            idx_order = {"out_b", "out_f", "out_y", "out_x + offset"};
+        } else if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 5) {
+            idx_order = {"out_b", "out_f", "out_z", "out_y", "out_x + offset"};
+        } else if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 6) {
+            idx_order = {"out_b", "out_f", "out_w", "out_z", "out_y", "out_x + offset"};
+        }
+
+        FusedOpsConfiguration conf = {"", idx_order, "res", input_dt, 1};
+
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
+
     return jit;
 }

@@ -148,7 +166,7 @@ KernelsData BroadcastKernelBase::GetCommonKernelsData(const Params& params) cons
                      false,
                      false,
                      1,
-                     0,
+                     GetFusedPrimitiveInputsCount(params),
                      1,
                      prim_params.is_shape_agnostic);
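As I read the `FusedOpsConfiguration {"", idx_order, "res", input_dt, 1}`: the empty suffix keeps the plain `FUSED_OPS`/`FUSED_OPS_RESULT` macro names, `idx_order` supplies the coordinate expressions the generated code uses to index the fused operand (which is why the kernel keeps an `offset` variable in scope at every fused store), `"res"` names the kernel-side variable holding the broadcast value, `input_dt` is its type, and the trailing `1` is the vector width, matching the kernel's per-element loops. Passing `GetFusedPrimitiveInputsCount(params)` instead of `0` reserves kernel arguments for the fused primitive's extra inputs.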

src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.h

Lines changed: 3 additions & 1 deletion
@@ -15,7 +15,9 @@ class BroadcastKernelRef : public BroadcastKernelBase {
     KernelsPriority GetKernelsPriority(const Params& params) const override;
     ParamsKey GetSupportedKey() const override;
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::REORDER };
+        return { FusedOpType::ELTWISE,
+                 FusedOpType::REORDER
+        };
     }
 };
 } // namespace kernel_selector
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+#include "fusion_test_common.hpp"
+
+#include <intel_gpu/primitives/input_layout.hpp>
+#include <intel_gpu/primitives/broadcast.hpp>
+#include <intel_gpu/primitives/eltwise.hpp>
+
+#include <cmath>
+
+using namespace cldnn;
+using namespace ::tests;
+
+namespace {
+struct broadcast_test_params {
+    ov::PartialShape input_size1;   // input for broadcast
+    ov::PartialShape input_size2;   // other input connected to output of broadcast
+    data_types input_type1;         // input data-type of 'input_size1'
+    data_types input_type2;
+    format input_format;
+    data_types default_type;
+    format default_format;
+    ov::AxisSet broadcast_axes;
+    size_t expected_fused_primitives;
+    size_t expected_not_fused_primitives;
+};
+
+class BroadcastFusingTest : public ::BaseFusingTest<broadcast_test_params> {
+public:
+    void execute(broadcast_test_params& p, bool count_reorder = false) {
+        auto input_prim = get_mem(get_input_layout1(p));
+        auto input_prim2 = get_mem(get_input_layout2(p));
+
+        network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
+        network network_fused(this->engine, this->topology_fused, cfg_fused);
+
+        auto inputs = network_fused.get_input_ids();
+        if (std::find(inputs.begin(), inputs.end(), "input") != inputs.end()) {
+            network_fused.set_input_data("input", input_prim);
+            network_not_fused.set_input_data("input", input_prim);
+        }
+        if (std::find(inputs.begin(), inputs.end(), "input2") != inputs.end()) {
+            network_fused.set_input_data("input2", input_prim2);
+            network_not_fused.set_input_data("input2", input_prim2);
+        }
+
+        compare(network_not_fused, network_fused, p, count_reorder);
+    }
+
+    layout get_input_layout1(broadcast_test_params& p) {
+        return layout{ p.input_size1, p.input_type1, p.input_format };
+    }
+
+    layout get_input_layout2(broadcast_test_params& p) {
+        return layout{ p.input_size2, p.input_type2, p.input_format };
+    }
+};
+} // namespace
+
+#define CASE_BROADCAST_FP16_1 { 1, 16, 4, 4 }, { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx, {0}
+#define CASE_BROADCAST_FP16_2 { 2, 1, 4, 4, 4 }, { 2, 16, 4, 4, 4 }, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx, {1}
+#define CASE_BROADCAST_FP16_3 { 2, 16, 4, 4, 1 }, { 2, 16, 4, 4, 8 }, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx, {4}
+
+#define CASE_BROADCAST_FP16_1_BLK { 2, 16, 4, 1 }, { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx, {3}
+#define CASE_BROADCAST_FP16_2_BLK { 1, 16, 4, 4 }, { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx, {0}
+#define CASE_BROADCAST_FP16_3_BLK { 2, 16, 4, 1 }, { 2, 16, 4, 4 }, data_types::u8, data_types::i8, format::b_fs_yx_fsv32, data_types::f16, format::bfyx, {3}
+
+class broadcast_fused_prims : public BroadcastFusingTest {};
+TEST_P(broadcast_fused_prims, broadcast_activation_with_broadcast) {
+    auto p = GetParam();
+    create_topologies(
+        input_layout("input", get_input_layout1(p)),
+        input_layout("input2", get_input_layout2(p)),
+        broadcast("broadcast", input_info("input"), get_input_layout2(p).get_shape(), ov::AxisSet(p.broadcast_axes), ov::op::BroadcastType::NUMPY),
+        eltwise("eltwise", { input_info("broadcast"), input_info("input2") }, eltwise_mode::sum, p.default_type),
+        activation("activation", input_info("eltwise"), activation_func::abs),
+        reorder("out", input_info("activation"), p.default_format, data_types::f32)
+    );
+
+    tolerance = default_tolerance(p.input_type1);
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, broadcast_fused_prims, ::testing::ValuesIn(std::vector<broadcast_test_params>{
+    broadcast_test_params{ CASE_BROADCAST_FP16_1, 4, 5 },
+    broadcast_test_params{ CASE_BROADCAST_FP16_2, 4, 5 },
+    broadcast_test_params{ CASE_BROADCAST_FP16_3, 4, 5 },
+
+    broadcast_test_params{ CASE_BROADCAST_FP16_1_BLK, 4, 5 },
+    broadcast_test_params{ CASE_BROADCAST_FP16_2_BLK, 4, 5 },
+    broadcast_test_params{ CASE_BROADCAST_FP16_3_BLK, 4, 5 },
+}));
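A note on the `{ ..., 4, 5 }` pairs: with reorders excluded from the count (`count_reorder` defaults to false), the unfused graph should execute five primitives (two inputs, broadcast, eltwise, activation) and the fused graph four, since the eltwise is absorbed into the broadcast; `BaseFusingTest::compare` presumably checks exactly these executed-primitive counts.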
