Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
4403d55
Refactor split-image implementation: simplify code and remove redunda…
JH-Leon-KIM-AMD Sep 26, 2025
74a7179
Add padding debug output to split-image implementation
JH-Leon-KIM-AMD Sep 28, 2025
e94a485
Fix sign comparison warning after rebase with origin/develop
JH-Leon-KIM-AMD Sep 28, 2025
9b31169
Fix Split-N with groups bug and clean up unused parameters
JH-Leon-KIM-AMD Sep 28, 2025
9c7df2a
Implement recursive queue-based split-image detection and calculation
JH-Leon-KIM-AMD Sep 29, 2025
1f48a5a
WIP: Split-Image investigation - found architecture mismatch
JH-Leon-KIM-AMD Sep 30, 2025
68af577
Add 1D split-image implementation for grouped convolution (N=1 working)
JH-Leon-KIM-AMD Oct 1, 2025
b51fedb
Add basic split-image implementation for 1D/2D/3D grouped convolution
JH-Leon-KIM-AMD Oct 2, 2025
2992c0b
Refactor split-image to unified structure for 1D/2D/3D
JH-Leon-KIM-AMD Oct 2, 2025
5024d43
Add safety checks for split-image in all dimensions
JH-Leon-KIM-AMD Oct 2, 2025
51c77f7
Fix Split-N + Split-Image compatibility issue
JH-Leon-KIM-AMD Oct 3, 2025
74671dd
Implement unified threshold for Split-N and Split-Image
JH-Leon-KIM-AMD Oct 3, 2025
49e46de
Comment out outdated split-image code (SplitConvProblem/LaunchKernelW…
JH-Leon-KIM-AMD Oct 3, 2025
eebb88d
Implement recursive split-image with depth limit (MAX_DEPTH=10)
JH-Leon-KIM-AMD Oct 3, 2025
73a6adf
Summary of recursive split-image implementation:
JH-Leon-KIM-AMD Oct 3, 2025
34326e2
Add comment explaining MAX_DEPTH capacity for 2GB threshold
JH-Leon-KIM-AMD Oct 6, 2025
414e9a1
Refactor: move recursive split-image logic to transformer
JH-Leon-KIM-AMD Oct 6, 2025
8c6d280
Apply clang-format-18 formatting
JH-Leon-KIM-AMD Oct 6, 2025
54869a3
Fix clang-format-18 issues in forward kernel
JH-Leon-KIM-AMD Oct 6, 2025
08bc24d
Merge branch 'develop' into LWPCK-3052-grouped-conv-split-image
JH-Leon-KIM-AMD Oct 6, 2025
e75944e
Update include/ck_tile/ops/grouped_convolution/utils/transform_conv_f…
JH-Leon-KIM-AMD Oct 8, 2025
a1f9d7e
Update include/ck_tile/ops/grouped_convolution/utils/transform_conv_f…
JH-Leon-KIM-AMD Oct 8, 2025
ca06bfe
Update include/ck_tile/ops/grouped_convolution/kernel/grouped_convolu…
JH-Leon-KIM-AMD Oct 8, 2025
5fbaecf
Update include/ck_tile/ops/grouped_convolution/kernel/grouped_convolu…
JH-Leon-KIM-AMD Oct 8, 2025
0ea5ece
Merge branch 'develop' into LWPCK-3052-grouped-conv-split-image
JH-Leon-KIM-AMD Oct 8, 2025
03e44ee
Merge branch 'develop' into LWPCK-3052-grouped-conv-split-image
JH-Leon-KIM-AMD Oct 9, 2025
58df1f6
Merge develop into LWPCK-3052: Accept universal GEMM pipeline, commen…
JH-Leon-KIM-AMD Oct 15, 2025
89c6c92
Split-Image implementation with temporary fixed divider
JH-Leon-KIM-AMD Oct 17, 2025
a779af6
Fix 1D split-image padding issue with per-piece dimensions
JH-Leon-KIM-AMD Oct 19, 2025
a7871c8
Fix 2D/3D split-image with independent split factors per dimension
JH-Leon-KIM-AMD Oct 19, 2025
c85669e
Remove unused split-image struct fields
JH-Leon-KIM-AMD Oct 19, 2025
d48b4ed
Refactor split-image invoker code for improved readability
JH-Leon-KIM-AMD Oct 19, 2025
49b622e
Refactor split-image code and remove debug prints
JH-Leon-KIM-AMD Oct 19, 2025
7db8d77
Add split-image safety constraints and refactor to utils
JH-Leon-KIM-AMD Oct 22, 2025
a87da59
Change split-image from runtime to compile-time branching
JH-Leon-KIM-AMD Oct 24, 2025
29fed44
Change split-image to compile-time branching
JH-Leon-KIM-AMD Oct 24, 2025
b28ea3c
Add split-image example as separate binary
JH-Leon-KIM-AMD Oct 25, 2025
badffd8
Replace linear search with binary search in find_piece_id
JH-Leon-KIM-AMD Oct 25, 2025
91ffc82
Simplify split-image code and fix integer overflow
JH-Leon-KIM-AMD Oct 27, 2025
d6184ed
Merge branch 'develop' into LWPCK-3052-grouped-conv-split-image
JH-Leon-KIM-AMD Oct 27, 2025
85c4c58
Trigger CI rerun - fix merge conflicts
JH-Leon-KIM-AMD Oct 28, 2025
61d0e6a
Fix merge conflict markers
JH-Leon-KIM-AMD Oct 28, 2025
02d33c3
Merge branch 'develop' into LWPCK-3052-grouped-conv-split-image
JH-Leon-KIM-AMD Oct 28, 2025
b8a94d5
Fix clang-format: remove space before {}
JH-Leon-KIM-AMD Oct 28, 2025
c80e237
Fix clang-format: comment wrapping and Swish constructor
JH-Leon-KIM-AMD Oct 28, 2025
387361d
Merge branch 'develop' into LWPCK-3052-grouped-conv-split-image
JH-Leon-KIM-AMD Oct 30, 2025
cc7299b
Rename split_image to large_tensor for clarity
JH-Leon-KIM-AMD Oct 30, 2025
8e03659
Update comments and include in large_tensor example
JH-Leon-KIM-AMD Oct 30, 2025
2fbb436
Remove test code, restore 2GB threshold
JH-Leon-KIM-AMD Oct 30, 2025
8558e07
Update include/ck_tile/ops/grouped_convolution/utils/transform_conv_f…
JH-Leon-KIM-AMD Oct 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions example/ck_tile/20_grouped_convolution/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@ set(EXAMPLE_CONV_COMPILE_OPTIONS)
list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)

add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp)
target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})

add_executable(tile_example_grouped_conv_fwd_large_tensor EXCLUDE_FROM_ALL grouped_convolution_forward_large_tensor.cpp)
target_compile_options(tile_example_grouped_conv_fwd_large_tensor PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})

add_executable(tile_example_grouped_conv_fwd_bias_clamp EXCLUDE_FROM_ALL grouped_convolution_forward_bias_clamp.cpp)
target_compile_options(tile_example_grouped_conv_fwd_bias_clamp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})

add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp)
target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})

add_executable(tile_example_grouped_conv_bwd_weight_two_stage EXCLUDE_FROM_ALL grouped_convolution_backward_weight_two_stage.cpp)
target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})

add_executable(tile_example_grouped_conv_bwd_data EXCLUDE_FROM_ALL grouped_convolution_backward_data.cpp)
target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

// Regular grouped convolution invoker (no split-image)
// This invoker demonstrates regular convolution without split-image.
// It always uses Kernel<false> (split-image disabled).
// For large images that require split-image, use
// grouped_convolution_forward_split_image_invoker.hpp

#pragma once

#include "grouped_convolution_utils.hpp"
Expand All @@ -21,6 +28,10 @@ struct GroupedConvolutionForwardInvoker
static float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs<CDElementWise>& args,
const ck_tile::stream_config& s)
{
if(s.log_level_ > 0)
{
std::cout << "[INVOKER] grouped_conv_fwd called, NDimSpatial=" << NDimSpatial << "\n";
}
constexpr int kBlockPerCu = 1;

// Implicit GEMM Traits
Expand Down Expand Up @@ -90,107 +101,125 @@ struct GroupedConvolutionForwardInvoker
1,
std::multiplies<ck_tile::index_t>());

// Split-K parameters
const ck_tile::index_t k_grain = args.k_batch * GemmConfig::K_Tile;
const ck_tile::index_t K_split = (gemm_k + k_grain - 1) / k_grain * GemmConfig::K_Tile;
const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop);
const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
float ave_time{0};

const auto Run =
[&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
constexpr bool has_hot_loop_v = has_hot_loop_.value;
constexpr auto tail_number_v = tail_number_.value;
constexpr auto scheduler = GemmConfig::Scheduler;
constexpr auto memory_operation = memory_operation_.value;

using UniversalGemmProblem =
ck_tile::UniversalGemmPipelineProblem<InDataType,
WeiDataType,
AccDataType,
GemmShape,
GemmUniversalTraits,
scheduler,
has_hot_loop_v,
tail_number_v,
ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough,
OutDataType,
true,
VectorSizeA,
VectorSizeB>;

using GemmPipeline = typename PipelineTypeTraits<
GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;

using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
InDataType,
WeiDataType,
DsDataType,
AccDataType,
OutDataType,
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
ck_tile::tensor_layout::gemm::RowMajor,
CDElementWise,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
GemmConfig::M_Warp,
GemmConfig::N_Warp,
GemmConfig::M_Warp_Tile,
GemmConfig::N_Warp_Tile,
GemmConfig::K_Warp_Tile,
GemmConfig::TransposeC,
memory_operation,
1,
true,
GroupedConvTraitsType::VectorSizeC>>;

using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
TilePartitioner,
GemmPipeline,
ConvEpilogue>;
auto kargs = Kernel::MakeKernelArgs(args);

const dim3 grids = Kernel::GridSize(kargs);
const dim3 blocks = Kernel::BlockSize();

if(!Kernel::IsSupportedArgument(kargs))
{
throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
}

if(s.log_level_ > 0)
{
std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
<< "shape: " << GemmShape::GetName() << '\n'
<< "problem: " << UniversalGemmProblem::GetName() << '\n'
<< "pipeline: " << GemmPipeline::GetName() << '\n'
<< "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
<< ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
<< "}" << '\n'
<< "Vector size A: " << GemmPipeline::GetVectorSizeA()
<< ", Vector size B: " << GemmPipeline::GetVectorSizeB()
<< ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
}

ave_time = ck_tile::launch_kernel(
s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));

return ave_time;
};
// =====================================================================
// Regular Convolution: Simple, no split-image
// =====================================================================
const auto Run = [&]<bool EnableSplitImage>(const auto has_hot_loop_,
const auto tail_number_,
const auto memory_operation_) {
constexpr bool has_hot_loop_v = has_hot_loop_.value;
constexpr auto tail_number_v = tail_number_.value;
constexpr auto scheduler = GemmConfig::Scheduler;
constexpr auto memory_operation = memory_operation_.value;

using UniversalGemmProblem =
ck_tile::UniversalGemmPipelineProblem<InDataType,
WeiDataType,
AccDataType,
GemmShape,
GemmUniversalTraits,
scheduler,
has_hot_loop_v,
tail_number_v,
ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough,
OutDataType,
true,
VectorSizeA,
VectorSizeB>;

using GemmPipeline = typename PipelineTypeTraits<
GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;

using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
InDataType,
WeiDataType,
DsDataType,
AccDataType,
OutDataType,
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
ck_tile::tensor_layout::gemm::RowMajor,
CDEElementWise,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
GemmConfig::M_Warp,
GemmConfig::N_Warp,
GemmConfig::M_Warp_Tile,
GemmConfig::N_Warp_Tile,
GemmConfig::K_Warp_Tile,
GemmConfig::TransposeC,
memory_operation,
1,
true,
GroupedConvTraitsType::VectorSizeC>>;

using Kernel = ck_tile::GroupedConvolutionForwardKernel<EnableSplitImage,
GroupedConvTraitsType,
TilePartitioner,
GemmPipeline,
ConvEpilogue>;
auto kargs = Kernel::MakeKernelArgs(args);

const dim3 grids = Kernel::GridSize(kargs);
const dim3 blocks = Kernel::BlockSize();

if(!Kernel::IsSupportedArgument(kargs))
{
throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
}

if(s.log_level_ > 0)
{
std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
<< "shape: " << GemmShape::GetName() << '\n'
<< "problem: " << UniversalGemmProblem::GetName() << '\n'
<< "pipeline: " << GemmPipeline::GetName() << '\n'
<< "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
<< ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
<< "}" << '\n'
<< "Vector size A: " << GemmPipeline::GetVectorSizeA()
<< ", Vector size B: " << GemmPipeline::GetVectorSizeB()
<< ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
}

ave_time = ck_tile::launch_kernel(
s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));

return ave_time;
};

// =====================================================================
// Split-K lambda
// =====================================================================
const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
if(args.k_batch == 1)
{
Run(has_hot_loop_, tail_number_, MemoryOpSet{});
Run.template operator()<false>(has_hot_loop_, tail_number_, MemoryOpSet{});
}
else
{
Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
Run.template operator()<false>(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
}
};

// =====================================================================
// Regular Convolution Example: ALWAYS uses regular path (Kernel<false>)
// =====================================================================
// This example demonstrates regular convolution without split-image.
// For large images that don't fit in memory, use
// grouped_convolution_forward_split_image.cpp

// Launch kernel using regular path (no split-image)
BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);

return ave_time;
}
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

// Large tensor grouped convolution example
// This example demonstrates convolution for large tensors that exceed memory limits.
// It uses automatic tensor splitting when needed to handle large images.
// For regular convolution without tensor splitting, use grouped_convolution_forward.cpp

#include <hip/hip_runtime.h>

#include <cstring>
#include <iostream>
#include <ostream>
#include <string>
#include <tuple>

#include "ck_tile/host.hpp"
#include "grouped_convolution_utils.hpp"
#include "grouped_convolution_forward_large_tensor_invoker.hpp"
#include "run_grouped_convolution_fwd_example.inc"

/// Parses command-line options and dispatches the large-tensor grouped
/// convolution example at the precision requested via "--prec".
///
/// @tparam GemmConfig template selecting the GEMM configuration per precision
/// @param argc / argv forwarded verbatim to argument parsing and the runner
/// @return the per-precision runner's status, or -1 when argument parsing
///         fails
/// @throws std::runtime_error for a precision other than "fp16" / "bf16"
///
/// NOTE(review): main() negates this return value, so the -1 parse-failure
/// path maps to exit code 0 — confirm that is the intended convention.
template <template <typename PrecType> typename GemmConfig>
int run_grouped_conv_fwd_example(int argc, char* argv[])
{
    using Invoker = GroupedConvolutionForwardInvoker;

    auto [parsed_ok, arg_parser] = create_args(argc, argv);
    if(!parsed_ok)
    {
        return -1;
    }

    const std::string prec       = arg_parser.get_str("prec");
    const std::string in_layout  = arg_parser.get_str("in_layout");
    const std::string wei_layout = arg_parser.get_str("wei_layout");
    const std::string out_layout = arg_parser.get_str("out_layout");

    // Dispatch on precision with guard-style branches; each branch fixes the
    // data type both in the GEMM config and in the kernel instantiation.
    if(prec == "bf16")
    {
        return run_grouped_conv_fwd_example_prec_type<Invoker,
                                                      GemmConfig<ck_tile::bf16_t>,
                                                      ck_tile::bf16_t>(
            in_layout, wei_layout, out_layout, argc, argv);
    }
    if(prec == "fp16")
    {
        return run_grouped_conv_fwd_example_prec_type<Invoker,
                                                      GemmConfig<ck_tile::half_t>,
                                                      ck_tile::half_t>(
            in_layout, wei_layout, out_layout, argc, argv);
    }
    throw std::runtime_error("Unsupported data type for this operation !!!");
}

// Entry point: picks the GEMM configuration at compile time depending on
// whether the build targets WMMA instructions, then runs the large-tensor
// grouped convolution example. The result is negated so a truthy
// (success-style) return maps to exit code 0.
// NOTE(review): the -1 parse-failure return from run_grouped_conv_fwd_example
// also negates to exit code 0 — confirm that is intended.
int main(int argc, char* argv[])
{
#if CK_TILE_USE_WMMA
    return !run_grouped_conv_fwd_example<GemmConfigComputeV3_WMMA>(argc, argv);
#else
    return !run_grouped_conv_fwd_example<GemmConfigComputeV3>(argc, argv);
#endif
}
Loading
Loading