Skip to content

Commit e20fa4f

Browse files
tgymnichkuhar
andauthored
[mlir][AMDGPU] Add PermlaneSwapOp (llvm#154345)
- Add PermlaneSwapOp that lowers to `rocdl.permlane16.swap` and `rocdl.permlane32.swap` --------- Co-authored-by: Jakub Kuderski <[email protected]>
1 parent fc62990 commit e20fa4f

File tree

5 files changed

+281
-1
lines changed

5 files changed

+281
-1
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -656,6 +656,48 @@ def AMDGPU_SwizzleBitModeOp : AMDGPU_Op<"swizzle_bitmode",
656656
}];
657657
}
658658

659+
def AMDGPU_PermlaneSwapOp : AMDGPU_Op<"permlane_swap", [Pure, AllTypesMatch<["result", "src"]>]> {
660+
let summary = "AMDGPU permlane swap op";
661+
let description = [{
662+
High-level wrapper on `rocdl.permlane{16,32}.swap` variants for permutations
663+
on rows of lanes in a subgroup.
664+
665+
Supports arbitrary int/float/vector types, which will be repacked to i32 and
666+
one or more `rocdl.permlane{16,32}.swap` ops during lowering.
667+
Supported lane permutations:
668+
- Swap the data between odd and even rows of 16 lanes
669+
- Swap the data between the first 32 lanes and the last 32 lanes
670+
671+
Example:
672+
```mlir
673+
%0 = amdgpu.permlane_swap %src 16 : f16
674+
%1 = amdgpu.permlane_swap %src 32 { fetch_inactive = true, bound_ctrl = true } : f16
675+
```
676+
677+
Operands:
678+
* `$src`: Vector register to permute across lanes of the subgroup.
679+
* `$row_length`: The length of a row to permute in number of lanes (valid values are 16 and 32).
680+
* `$fetch_inactive`: Optional. Used to determine the behavior of a fetch from a disabled lane.
681+
`fetch_inactive = false`: If the source lane is disabled, use `bound_ctrl` to determine the source value.
682+
`fetch_inactive = true`: If the source lane is disabled, fetch the source value anyway (ignoring `bound_ctrl`).
683+
* `$bound_ctrl`: Optional. Used to determine what a thread should do if its source operand is from
684+
a disabled lane: use the value zero, or disable the write.
685+
`bound_ctrl = false`: Do not write when source is from a disabled lane
686+
`bound_ctrl = true`: Use zero as input if source is from a disabled lane
687+
688+
Note: Lowering is only supported on gfx950 and up.
689+
}];
690+
let arguments = (ins AnyIntegerOrFloatOr1DVector:$src,
691+
I32Attr:$row_length,
692+
DefaultValuedAttr<BoolAttr, "false">:$fetch_inactive,
693+
DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl);
694+
let results = (outs AnyIntegerOrFloatOr1DVector:$result);
695+
let assemblyFormat = [{
696+
$src $row_length attr-dict `:` type($result)
697+
}];
698+
let hasVerifier = 1;
699+
}
700+
659701
def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
660702
let summary = "Barrier that includes a wait for LDS memory operations.";
661703
let description = [{

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
1515
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
1616
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
17+
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
1718
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
1819
#include "mlir/IR/BuiltinTypes.h"
1920
#include "mlir/IR/TypeUtilities.h"
@@ -1876,6 +1877,54 @@ struct AMDGPUSwizzleBitModeLowering
18761877
}
18771878
};
18781879

1880+
/// Conversion pattern that rewrites `PermlaneSwapOp` into
/// `ROCDL::Permlane16SwapOp` or `ROCDL::Permlane32SwapOp`, selected by the
/// op's row length. The converted source value is decomposed into i32 pieces,
/// each piece is swapped on its own, and the swapped pieces are composed back
/// into a value of the source's type.
struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneSwapOp> {
1881+
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
1882+
1883+
/// Builds the pattern for `converter` and records the target `chipset`.
AMDGPUPermlaneLowering(const LLVMTypeConverter &converter, Chipset chipset)
1884+
: ConvertOpToLLVMPattern<PermlaneSwapOp>(converter), chipset(chipset) {}
1885+
/// Target chipset; compared against `kGfx950` in `matchAndRewrite`.
Chipset chipset;
1886+
1887+
/// Replaces `op` with one ROCDL swap op per i32 piece of the source.
/// Emits an op error and fails when `chipset` is below `kGfx950`.
LogicalResult
1888+
matchAndRewrite(PermlaneSwapOp op, OpAdaptor adaptor,
1889+
ConversionPatternRewriter &rewriter) const override {
1890+
// The lowering is refused on chipsets older than gfx950.
if (chipset < kGfx950)
1891+
return op->emitOpError("permlane_swap is only supported on gfx950+");
1892+
1893+
Location loc = op.getLoc();
1894+
Type i32 = rewriter.getI32Type();
1895+
// Use the adaptor's source operand rather than op.getSrc().
Value src = adaptor.getSrc();
1896+
unsigned row_length = op.getRowLength();
1897+
bool fi = op.getFetchInactive();
1898+
bool boundctrl = op.getBoundCtrl();
1899+
1900+
// Break the source into values requested as i32. NOTE(review): the exact
// packing rules live in LLVM::decomposeValue, which is not visible here.
SmallVector<Value> decomposed =
1901+
LLVM::decomposeValue(rewriter, loc, src, i32);
1902+
1903+
SmallVector<Value> permuted;
1904+
for (Value v : decomposed) {
1905+
Value res;
1906+
// Result type of the swap ops: a literal LLVM struct with two fields,
// both of v's type.
Type i32pair = LLVM::LLVMStructType::getLiteral(
1907+
rewriter.getContext(), {v.getType(), v.getType()});
1908+
1909+
// `v` is passed for both value operands of the swap op. Any row length
// other than 16 or 32 is treated as unreachable here.
if (row_length == 16)
1910+
res = ROCDL::Permlane16SwapOp::create(rewriter, loc, i32pair, v, v, fi,
1911+
boundctrl);
1912+
else if (row_length == 32)
1913+
res = ROCDL::Permlane32SwapOp::create(rewriter, loc, i32pair, v, v, fi,
1914+
boundctrl);
1915+
else
1916+
llvm_unreachable("unsupported row length");
1917+
1918+
// Only field 0 of the returned struct is kept; field 1 is unused.
Value vdstNew = LLVM::ExtractValueOp::create(rewriter, loc, res, {0});
1919+
permuted.emplace_back(vdstNew);
1920+
}
1921+
1922+
// Reassemble the swapped pieces into a value of the source's type.
Value result = LLVM::composeValue(rewriter, loc, permuted, src.getType());
1923+
rewriter.replaceOp(op, result);
1924+
return success();
1925+
}
1926+
};
1927+
18791928
struct ConvertAMDGPUToROCDLPass
18801929
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
18811930
using Base::Base;
@@ -1944,6 +1993,6 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
19441993
WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
19451994
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
19461995
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
1947-
TransposeLoadOpLowering>(converter, chipset);
1996+
TransposeLoadOpLowering, AMDGPUPermlaneLowering>(converter, chipset);
19481997
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
19491998
}

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,18 @@ LogicalResult DPPOp::verify() {
510510
return success();
511511
}
512512

513+
//===----------------------------------------------------------------------===//
514+
// PermlaneSwapOp
515+
//===----------------------------------------------------------------------===//
516+
/// Verifies the op's `row_length`: succeeds when it is 16 or 32 and emits an
/// op error for any other value.
LogicalResult PermlaneSwapOp::verify() {
517+
unsigned rowLength = getRowLength();
518+
519+
// 16 and 32 are the only accepted row lengths.
if (rowLength != 16 && rowLength != 32)
520+
return emitOpError("row_length attribute must either be 16 or 32.");
521+
522+
return success();
523+
}
524+
513525
//===----------------------------------------------------------------------===//
514526
// GatherToLDSOp
515527
//===----------------------------------------------------------------------===//
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
// RUN: mlir-opt --convert-amdgpu-to-rocdl=chipset=gfx950 --canonicalize %s | FileCheck %s
2+
3+
// CHECK-LABEL: func @test_permlane16_i32
4+
// CHECK-SAME: (%[[ARG0:.*]]: i32)
5+
func.func @test_permlane16_i32(%arg0 : i32) -> i32 {
6+
// CHECK: %[[PERM:.*]] = rocdl.permlane16.swap %[[ARG0]], %[[ARG0]], false, false : (i32, i32) -> <(i32, i32)>
7+
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
8+
// CHECK: return %[[RES]] : i32
9+
%0 = amdgpu.permlane_swap %arg0 16 : i32
10+
return %0 : i32
11+
}
12+
13+
// CHECK-LABEL: func @test_permlane16_i32_optional_attr
14+
// CHECK-SAME: (%[[ARG0:.*]]: i32)
15+
func.func @test_permlane16_i32_optional_attr(%arg0 : i32) -> i32 {
16+
// CHECK: %[[PERM:.*]] = rocdl.permlane16.swap %[[ARG0]], %[[ARG0]], true, true : (i32, i32) -> <(i32, i32)>
17+
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
18+
// CHECK: return %[[RES]] : i32
19+
%0 = amdgpu.permlane_swap %arg0 16 { fetch_inactive = true, bound_ctrl = true } : i32
20+
return %0 : i32
21+
}
22+
23+
// CHECK-LABEL: func @test_permlane32_i32
24+
// CHECK-SAME: (%[[ARG0:.*]]: i32)
25+
func.func @test_permlane32_i32(%arg0 : i32) -> i32 {
26+
// CHECK: %[[PERM:.*]] = rocdl.permlane32.swap %[[ARG0]], %[[ARG0]], false, false : (i32, i32) -> <(i32, i32)>
27+
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
28+
// CHECK: return %[[RES]] : i32
29+
%0 = amdgpu.permlane_swap %arg0 32 : i32
30+
return %0 : i32
31+
}
32+
33+
// CHECK-LABEL: func @test_permlane16_f32
34+
// CHECK-SAME: (%[[ARG0:.*]]: f32)
35+
func.func @test_permlane16_f32(%arg0 : f32) -> f32 {
36+
// CHECK: %[[CAST:.*]] = llvm.bitcast %[[ARG0]] : f32 to i32
37+
// CHECK: %[[PERM:.*]] = rocdl.permlane16.swap %[[CAST]], %[[CAST]], false, false : (i32, i32) -> <(i32, i32)>
38+
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
39+
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
40+
// CHECK: return %[[RES_CAST]] : f32
41+
%0 = amdgpu.permlane_swap %arg0 16 : f32
42+
return %0 : f32
43+
}
44+
45+
// CHECK-LABEL: func @test_permlane32_f32
46+
// CHECK-SAME: (%[[ARG0:.*]]: f32)
47+
func.func @test_permlane32_f32(%arg0 : f32) -> f32 {
48+
// CHECK: %[[CAST:.*]] = llvm.bitcast %[[ARG0]] : f32 to i32
49+
// CHECK: %[[PERM:.*]] = rocdl.permlane32.swap %[[CAST]], %[[CAST]], false, false : (i32, i32) -> <(i32, i32)>
50+
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
51+
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
52+
// CHECK: return %[[RES_CAST]] : f32
53+
%0 = amdgpu.permlane_swap %arg0 32 : f32
54+
return %0 : f32
55+
}
56+
57+
// CHECK-LABEL: func @test_permlane16_f16
58+
// CHECK-SAME: (%[[ARG0:.*]]: f16)
59+
func.func @test_permlane16_f16(%arg0 : f16) -> f16 {
60+
// CHECK: %[[CAST:.*]] = llvm.bitcast %[[ARG0]] : f16 to i16
61+
// CHECK: %[[ZEXT:.*]] = llvm.zext %[[CAST]] : i16 to i32
62+
// CHECK: %[[PERM:.*]] = rocdl.permlane16.swap %[[ZEXT]], %[[ZEXT]], false, false : (i32, i32) -> <(i32, i32)>
63+
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
64+
// CHECK: %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
65+
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
66+
// CHECK: return %[[RES_CAST]] : f16
67+
%0 = amdgpu.permlane_swap %arg0 16 : f16
68+
return %0 : f16
69+
}
70+
71+
// CHECK-LABEL: func @test_permlane32_f16
72+
// CHECK-SAME: (%[[ARG0:.*]]: f16)
73+
func.func @test_permlane32_f16(%arg0 : f16) -> f16 {
74+
// CHECK: %[[CAST:.*]] = llvm.bitcast %[[ARG0]] : f16 to i16
75+
// CHECK: %[[ZEXT:.*]] = llvm.zext %[[CAST]] : i16 to i32
76+
// CHECK: %[[PERM:.*]] = rocdl.permlane32.swap %[[ZEXT]], %[[ZEXT]], false, false : (i32, i32) -> <(i32, i32)>
77+
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
78+
// CHECK: %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
79+
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
80+
// CHECK: return %[[RES_CAST]] : f16
81+
%0 = amdgpu.permlane_swap %arg0 32 : f16
82+
return %0 : f16
83+
}
84+
85+
// CHECK-LABEL: func @test_permlane16_2xi32
86+
// CHECK-SAME: (%[[ARG0:.*]]: vector<2xi32>)
87+
func.func @test_permlane16_2xi32(%arg0 : vector<2xi32>) -> vector<2xi32> {
88+
// CHECK-DAG: %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
89+
// CHECK-DAG: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
90+
// CHECK-DAG: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
91+
// CHECK: %[[ELEM0:.*]] = llvm.extractelement %[[ARG0]][%[[C0]] : i32] : vector<2xi32>
92+
// CHECK: %[[ELEM1:.*]] = llvm.extractelement %[[ARG0]][%[[C1]] : i32] : vector<2xi32>
93+
// CHECK: %[[PERM0_TUPLE:.*]] = rocdl.permlane16.swap %[[ELEM0]], %[[ELEM0]], false, false : (i32, i32) -> <(i32, i32)>
94+
// CHECK: %[[PERM0:.*]] = llvm.extractvalue %[[PERM0_TUPLE]][0] : !llvm.struct<(i32, i32)>
95+
// CHECK: %[[PERM1_TUPLE:.*]] = rocdl.permlane16.swap %[[ELEM1]], %[[ELEM1]], false, false : (i32, i32) -> <(i32, i32)>
96+
// CHECK: %[[PERM1:.*]] = llvm.extractvalue %[[PERM1_TUPLE]][0] : !llvm.struct<(i32, i32)>
97+
// CHECK: %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
98+
// CHECK: %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
99+
// CHECK: return %[[VEC_INSERT1]] : vector<2xi32>
100+
%0 = amdgpu.permlane_swap %arg0 16 : vector<2xi32>
101+
return %0 : vector<2xi32>
102+
}
103+
104+
// CHECK-LABEL: func @test_permlane32_2xi32
105+
// CHECK-SAME: (%[[ARG0:.*]]: vector<2xi32>)
106+
func.func @test_permlane32_2xi32(%arg0 : vector<2xi32>) -> vector<2xi32> {
107+
// CHECK-DAG: %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
108+
// CHECK-DAG: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
109+
// CHECK-DAG: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
110+
// CHECK: %[[ELEM0:.*]] = llvm.extractelement %[[ARG0]][%[[C0]] : i32] : vector<2xi32>
111+
// CHECK: %[[ELEM1:.*]] = llvm.extractelement %[[ARG0]][%[[C1]] : i32] : vector<2xi32>
112+
// CHECK: %[[PERM0_TUPLE:.*]] = rocdl.permlane32.swap %[[ELEM0]], %[[ELEM0]], false, false : (i32, i32) -> <(i32, i32)>
113+
// CHECK: %[[PERM0:.*]] = llvm.extractvalue %[[PERM0_TUPLE]][0] : !llvm.struct<(i32, i32)>
114+
// CHECK: %[[PERM1_TUPLE:.*]] = rocdl.permlane32.swap %[[ELEM1]], %[[ELEM1]], false, false : (i32, i32) -> <(i32, i32)>
115+
// CHECK: %[[PERM1:.*]] = llvm.extractvalue %[[PERM1_TUPLE]][0] : !llvm.struct<(i32, i32)>
116+
// CHECK: %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
117+
// CHECK: %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
118+
// CHECK: return %[[VEC_INSERT1]] : vector<2xi32>
119+
%0 = amdgpu.permlane_swap %arg0 32 : vector<2xi32>
120+
return %0 : vector<2xi32>
121+
}
122+
123+
// CHECK-LABEL: func @test_permlane16_4xf16
124+
// CHECK-SAME: (%[[ARG0:.*]]: vector<4xf16>)
125+
func.func @test_permlane16_4xf16(%arg0 : vector<4xf16>) -> vector<4xf16> {
126+
// CHECK-DAG: %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
127+
// CHECK-DAG: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
128+
// CHECK-DAG: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
129+
// CHECK: %[[CAST1:.*]] = llvm.bitcast %[[ARG0]] : vector<4xf16> to vector<2xi32>
130+
// CHECK: %[[ELEM0:.*]] = llvm.extractelement %[[CAST1]][%[[C0]] : i32] : vector<2xi32>
131+
// CHECK: %[[ELEM1:.*]] = llvm.extractelement %[[CAST1]][%[[C1]] : i32] : vector<2xi32>
132+
// CHECK: %[[PERM0_TUPLE:.*]] = rocdl.permlane16.swap %[[ELEM0]], %[[ELEM0]], false, false : (i32, i32) -> <(i32, i32)>
133+
// CHECK: %[[PERM0:.*]] = llvm.extractvalue %[[PERM0_TUPLE]][0] : !llvm.struct<(i32, i32)>
134+
// CHECK: %[[PERM1_TUPLE:.*]] = rocdl.permlane16.swap %[[ELEM1]], %[[ELEM1]], false, false : (i32, i32) -> <(i32, i32)>
135+
// CHECK: %[[PERM1:.*]] = llvm.extractvalue %[[PERM1_TUPLE]][0] : !llvm.struct<(i32, i32)>
136+
// CHECK: %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
137+
// CHECK: %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
138+
// CHECK: %[[CAST2:.*]] = llvm.bitcast %[[VEC_INSERT1]] : vector<2xi32> to vector<4xf16>
139+
// CHECK: return %[[CAST2]] : vector<4xf16>
140+
%0 = amdgpu.permlane_swap %arg0 16 : vector<4xf16>
141+
return %0 : vector<4xf16>
142+
}
143+
144+
// CHECK-LABEL: func @test_permlane32_4xf16
145+
// CHECK-SAME: (%[[ARG0:.*]]: vector<4xf16>)
146+
func.func @test_permlane32_4xf16(%arg0 : vector<4xf16>) -> vector<4xf16> {
147+
// CHECK-DAG: %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
148+
// CHECK-DAG: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
149+
// CHECK-DAG: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
150+
// CHECK: %[[CAST1:.*]] = llvm.bitcast %[[ARG0]] : vector<4xf16> to vector<2xi32>
151+
// CHECK: %[[ELEM0:.*]] = llvm.extractelement %[[CAST1]][%[[C0]] : i32] : vector<2xi32>
152+
// CHECK: %[[ELEM1:.*]] = llvm.extractelement %[[CAST1]][%[[C1]] : i32] : vector<2xi32>
153+
// CHECK: %[[PERM0_TUPLE:.*]] = rocdl.permlane32.swap %[[ELEM0]], %[[ELEM0]], false, false : (i32, i32) -> <(i32, i32)>
154+
// CHECK: %[[PERM0:.*]] = llvm.extractvalue %[[PERM0_TUPLE]][0] : !llvm.struct<(i32, i32)>
155+
// CHECK: %[[PERM1_TUPLE:.*]] = rocdl.permlane32.swap %[[ELEM1]], %[[ELEM1]], false, false : (i32, i32) -> <(i32, i32)>
156+
// CHECK: %[[PERM1:.*]] = llvm.extractvalue %[[PERM1_TUPLE]][0] : !llvm.struct<(i32, i32)>
157+
// CHECK: %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
158+
// CHECK: %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
159+
// CHECK: %[[CAST2:.*]] = llvm.bitcast %[[VEC_INSERT1]] : vector<2xi32> to vector<4xf16>
160+
// CHECK: return %[[CAST2]] : vector<4xf16>
161+
%0 = amdgpu.permlane_swap %arg0 32 : vector<4xf16>
162+
return %0 : vector<4xf16>
163+
}

mlir/test/Dialect/AMDGPU/ops.mlir

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,20 @@ func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
524524
func.return %0 : f32
525525
}
526526

527+
// CHECK-LABEL: func @permlane16_swap
528+
func.func @permlane16_swap(%arg0 : f32) -> f32 {
529+
// CHECK: amdgpu.permlane_swap
530+
%0 = amdgpu.permlane_swap %arg0 16 : f32
531+
func.return %0 : f32
532+
}
533+
534+
// CHECK-LABEL: func @permlane32_swap
535+
func.func @permlane32_swap(%arg0 : f32) -> f32 {
536+
// CHECK: amdgpu.permlane_swap
537+
%0 = amdgpu.permlane_swap %arg0 32 : f32
538+
func.return %0 : f32
539+
}
540+
527541
// CHECK-LABEL: func @scaled_mfma
528542
func.func @scaled_mfma(%arg0 : f8E8M0FNU, %arg1 : vector<32xf6E2M3FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
529543
// CHECK: amdgpu.scaled_mfma

0 commit comments

Comments
 (0)