Commit 7df812e

[midend] Add matmul vectorization for decode phase.
1 parent c1f6583 commit 7df812e

9 files changed: +396 / -19 lines

examples/BuddyDeepSeekR1/CMakeLists.txt

Lines changed: 7 additions & 7 deletions
@@ -55,7 +55,7 @@ add_custom_command(
  -affine-parallelize
  -convert-vector-to-scf
  -lower-affine
- -convert-scf-to-openmp=num-threads=32
+ -convert-scf-to-openmp=num-threads=48
  -cse
  -memref-expand
  -arith-expand
@@ -101,7 +101,7 @@ add_custom_command(
  -affine-parallelize
  -convert-vector-to-scf
  -lower-affine
- -convert-scf-to-openmp=num-threads=32
+ -convert-scf-to-openmp=num-threads=48
  -func-bufferize-dynamic-offset
  -cse
  -memref-expand
@@ -147,7 +147,7 @@ add_custom_command(
  -affine-parallelize
  -convert-vector-to-scf
  -lower-affine
- -convert-scf-to-openmp=num-threads=32
+ -convert-scf-to-openmp=num-threads=48
  -cse
  -memref-expand
  -arith-expand
@@ -187,14 +187,14 @@ add_custom_command(
  -buffer-deallocation-simplification
  -bufferization-lower-deallocations
  -assume-tight-memref-layout
- -matmul-parallel-vectorization-optimize
+ -matmul-vectorization-decode
  -batchmatmul-optimize
  -convert-linalg-to-affine-loops
  -affine-loop-fusion
  -affine-parallelize
  -convert-vector-to-scf
  -lower-affine
- -convert-scf-to-openmp=num-threads=32
+ -convert-scf-to-openmp=num-threads=48
  -func-bufferize-dynamic-offset
  -cse
  -memref-expand
@@ -238,7 +238,7 @@ add_custom_command(
  -affine-parallelize
  -convert-vector-to-scf
  -lower-affine
- -convert-scf-to-openmp=num-threads=32
+ -convert-scf-to-openmp=num-threads=48
  -cse
  -memref-expand
  -arith-expand
@@ -287,7 +287,7 @@ add_custom_command(
  -affine-parallelize
  -convert-vector-to-scf
  -lower-affine
- -convert-scf-to-openmp=num-threads=32
+ -convert-scf-to-openmp=num-threads=48
  -func-bufferize-dynamic-offset
  -canonicalize
  -cse

examples/BuddyDeepSeekR1/README.md

Lines changed: 7 additions & 7 deletions
@@ -80,18 +80,18 @@ $ cmake -G Ninja .. -DBUDDY_DEEPSEEKR1_EXAMPLES=ON

  //f32
  $ ninja buddy-deepseek-r1-run
- $ cd bin
- $ ./buddy-deepseek-r1-run
+ $ ./bin/buddy-deepseek-r1-run
+
+ // NUMA node binding
+ numactl --cpunodebind=0,1 --membind=0,1 taskset -c 0-47 ./bin/buddy-deepseek-r1-run

  //f16
  $ ninja buddy-deepseek-r1-f16-run
- $ cd bin
- $ ./buddy-deepseek-r1-f16-run
+ $ ./bin/buddy-deepseek-r1-f16-run

  //bf16
  $ ninja buddy-deepseek-r1-bf16-run
- $ cd bin
- $ ./buddy-deepseek-r1-bf16-run
+ $ ./bin/buddy-deepseek-r1-bf16-run
  ```

  5. Enjoy it!
@@ -170,7 +170,7 @@ const std::string paramsDir = deepSeekR1Dir + "arg0.data";
  -mabi=lp64d
  )
  ```
-
+
  The complete modified CMakeLists file is attached in appendix, you could copy and paste it directly.

  7. Build and run the model:

examples/BuddyNext/makefile

Lines changed: 46 additions & 2 deletions
@@ -2100,6 +2100,8 @@ next-matmul-transpose-op-lower:
  -matmul-transpose-b-vectorization \
  -o log.mlir

+ NUM_THREADS := 48
+
  next-linalg-matmul-aot-omp:
  @${MLIR_OPT} ./next-linalg-matmul.mlir \
  -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
@@ -2114,14 +2116,50 @@ next-linalg-matmul-aot-omp:
  -convert-bufferization-to-memref \
  -bufferization-lower-deallocations \
  -assume-tight-memref-layout \
- -matmul-parallel-vectorization-optimize \
+ -matmul-vectorization-decode \
  -batchmatmul-optimize \
  -convert-linalg-to-affine-loops \
  -affine-loop-fusion \
  -affine-parallelize \
  -convert-vector-to-scf \
  -lower-affine \
- -convert-scf-to-openmp=num-threads=32 \
+ -convert-scf-to-openmp=num-threads=$(NUM_THREADS) \
+ -func-bufferize-dynamic-offset \
+ -cse \
+ -memref-expand \
+ -arith-expand \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-scf-to-cf \
+ -convert-cf-to-llvm \
+ -convert-openmp-to-llvm \
+ -convert-arith-to-llvm \
+ -convert-math-to-llvm \
+ -convert-math-to-libm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_TRANSLATE} -mlir-to-llvmir | \
+ ${CLANG} -x ir - \
+ ${MARCH_FLAG} -O3 \
+ -L${MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils -lomp -lm \
+ -Wl,-rpath,${MLIR_LIB} \
+ -o next-linalg-matmul.out
+ export OMP_NUM_THREADS=$(NUM_THREADS)
+ export OMP_PLACES=cores
+ export OMP_PROC_BIND=close
+ numactl --cpunodebind=0,1 --membind=0,1 \
+ taskset -c 0-47 \
+ ./next-linalg-matmul.out || true
+
+ next-linalg-matmul-decode-perf:
+ @${BUDDY_OPT} ./next-linalg-matmul-decode.mlir \
+ -convert-linalg-to-affine-loops \
+ -affine-loop-fusion \
+ -affine-parallelize \
+ -convert-vector-to-scf \
+ -lower-affine \
+ -convert-scf-to-openmp=num-threads=$(NUM_THREADS) \
  -func-bufferize-dynamic-offset \
  -cse \
  -memref-expand \
@@ -2143,4 +2181,10 @@ next-linalg-matmul-aot-omp:
  -L${MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils -lomp -lm \
  -Wl,-rpath,${MLIR_LIB} \
  -o next-linalg-matmul.out
+ export OMP_NUM_THREADS=$(NUM_THREADS)
+ export OMP_PLACES=cores
+ export OMP_PROC_BIND=close
+ perf stat -r 5 -d \
+ numactl --cpunodebind=0,1 --membind=0,1 \
+ taskset -c 0-47 \
  ./next-linalg-matmul.out || true

Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
// RUN: buddy-opt %s \
// RUN: -convert-linalg-to-affine-loops \
// RUN: -affine-loop-fusion \
// RUN: -affine-parallelize \
// RUN: -convert-vector-to-scf \
// RUN: -lower-affine \
// RUN: -func-bufferize-dynamic-offset \
// RUN: -cse \
// RUN: -memref-expand \
// RUN: -arith-expand \
// RUN: -convert-vector-to-llvm \
// RUN: -convert-arith-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-scf-to-cf \
// RUN: -convert-cf-to-llvm \
// RUN: -convert-openmp-to-llvm \
// RUN: -convert-arith-to-llvm \
// RUN: -convert-math-to-llvm \
// RUN: -convert-math-to-libm \
// RUN: -convert-func-to-llvm \
// RUN: -reconcile-unrealized-casts | \
// RUN: mlir-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: | FileCheck %s

#map = affine_map<(d0) -> (d0 mod 64)>
#map1 = affine_map<(d0) -> (d0 ceildiv 64)>
#map2 = affine_map<(d0) -> (d0)>
module {
  func.func private @printMemrefF32(memref<*xf32>)
  func.func private @rtclock() -> f64
  func.func @kernel(%arg0: memref<8960x1536xf32, strided<[?, ?], offset: ?>>) -> memref<1x1536xf32> {
    %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %arg0 : memref<8960x1536xf32, strided<[?, ?], offset: ?>> -> memref<f32>, index, index, index, index, index
    %b = memref.reinterpret_cast %base_buffer to offset: [%offset], sizes: [8960, 1536], strides: [%strides#0, 1] : memref<f32> to memref<8960x1536xf32, strided<[?, 1], offset: ?>>
    %true = arith.constant true
    %cst = arith.constant 4.000000e+00 : f32
    %cst_0 = arith.constant 2.000000e+00 : f32
    %a = memref.alloc() {alignment = 64 : i64} : memref<1x8960xf32>
    linalg.fill ins(%cst_0 : f32) outs(%a : memref<1x8960xf32>)
    %c = memref.alloc() {alignment = 64 : i64} : memref<1x1536xf32>
    linalg.fill ins(%cst : f32) outs(%c : memref<1x1536xf32>)
    %0 = call @rtclock() : () -> f64

    memref.assume_alignment %a, 64 : memref<1x8960xf32>
    memref.assume_alignment %b, 64 : memref<8960x1536xf32, strided<[?, 1], offset: ?>>
    memref.assume_alignment %c, 64 : memref<1x1536xf32>

    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %step = arith.constant 32 : index
    %prefetch_step = arith.constant 1024 : index
    %m = arith.constant 1 : index
    %n = arith.constant 1536 : index
    %k = arith.constant 8960 : index

    scf.parallel (%n_idx) = (%c0) to (%n) step (%step) {
      %c_vec = vector.load %c[%c0, %n_idx] {alignment = 64 : i64} : memref<1x1536xf32>, vector<32xf32>
      %sum_iter = scf.for %k_idx = %c0 to %k step %c1 iter_args(%sum_vec = %c_vec) -> (vector<32xf32>) {
        %k_prefetch = arith.addi %k_idx, %prefetch_step : index
        memref.prefetch %b[%k_prefetch, %n_idx], read, locality<0>, data : memref<8960x1536xf32, strided<[?, 1], offset: ?>>
        %a_ele = memref.load %a[%c0, %k_idx] : memref<1x8960xf32>
        %a_vec = vector.broadcast %a_ele : f32 to vector<32xf32>
        %b_vec = vector.load %b[%k_idx, %n_idx] {alignment = 64 : i64, nontemporal = true} : memref<8960x1536xf32, strided<[?, 1], offset: ?>>, vector<32xf32>
        %r_vec = vector.fma %a_vec, %b_vec, %sum_vec : vector<32xf32>
        scf.yield %r_vec : vector<32xf32>
      }
      vector.store %sum_iter, %c[%c0, %n_idx] {alignment = 64 : i64} : memref<1x1536xf32>, vector<32xf32>
    }

    %5 = call @rtclock() : () -> f64
    %6 = arith.subf %5, %0 : f64
    vector.print %6 : f64
    // CHECK: {{[0-9]+\.[0-9]+}}
    return %c : memref<1x1536xf32>
  }
  func.func @main() {
    %true = arith.constant true
    %cst = arith.constant 3.000000e+00 : f32
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<8960x1536xf32>
    linalg.fill ins(%cst : f32) outs(%alloc : memref<8960x1536xf32>)
    %cast = memref.cast %alloc : memref<8960x1536xf32> to memref<8960x1536xf32, strided<[?, ?], offset: ?>>
    %0 = call @kernel(%cast) : (memref<8960x1536xf32, strided<[?, ?], offset: ?>>) -> memref<1x1536xf32>
    %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %0 : memref<1x1536xf32> -> memref<f32>, index, index, index, index, index
    %alloc_0 = memref.alloc() : memref<2xindex>
    %alloc_1 = memref.alloc() : memref<2xi1>
    %alloc_2 = memref.alloc() : memref<0xindex>
    %intptr = memref.extract_aligned_pointer_as_index %alloc : memref<8960x1536xf32> -> index
    %c0 = arith.constant 0 : index
    memref.store %intptr, %alloc_0[%c0] : memref<2xindex>
    %intptr_3 = memref.extract_aligned_pointer_as_index %base_buffer : memref<f32> -> index
    %c1 = arith.constant 1 : index
    memref.store %intptr_3, %alloc_0[%c1] : memref<2xindex>
    %c0_4 = arith.constant 0 : index
    memref.store %true, %alloc_1[%c0_4] : memref<2xi1>
    %c1_5 = arith.constant 1 : index
    memref.store %true, %alloc_1[%c1_5] : memref<2xi1>
    %cast_6 = memref.cast %alloc_0 : memref<2xindex> to memref<?xindex>
    %cast_7 = memref.cast %alloc_1 : memref<2xi1> to memref<?xi1>
    %cast_8 = memref.cast %alloc_2 : memref<0xindex> to memref<?xindex>
    %alloc_9 = memref.alloc() : memref<2xi1>
    %alloc_10 = memref.alloc() : memref<0xi1>
    %cast_11 = memref.cast %alloc_9 : memref<2xi1> to memref<?xi1>
    %cast_12 = memref.cast %alloc_10 : memref<0xi1> to memref<?xi1>
    call @dealloc_helper(%cast_6, %cast_8, %cast_7, %cast_11, %cast_12) : (memref<?xindex>, memref<?xindex>, memref<?xi1>, memref<?xi1>, memref<?xi1>) -> ()
    %c0_13 = arith.constant 0 : index
    %1 = memref.load %alloc_9[%c0_13] : memref<2xi1>
    scf.if %1 {
      memref.dealloc %alloc : memref<8960x1536xf32>
    }
    %c1_14 = arith.constant 1 : index
    %2 = memref.load %alloc_9[%c1_14] : memref<2xi1>
    scf.if %2 {
      memref.dealloc %base_buffer : memref<f32>
    }
    memref.dealloc %alloc_0 : memref<2xindex>
    memref.dealloc %alloc_2 : memref<0xindex>
    memref.dealloc %alloc_1 : memref<2xi1>
    memref.dealloc %alloc_9 : memref<2xi1>
    memref.dealloc %alloc_10 : memref<0xi1>
    return
  }
  func.func private @dealloc_helper(%arg0: memref<?xindex>, %arg1: memref<?xindex>, %arg2: memref<?xi1>, %arg3: memref<?xi1>, %arg4: memref<?xi1>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %true = arith.constant true
    %false = arith.constant false
    %dim = memref.dim %arg0, %c0 : memref<?xindex>
    %dim_0 = memref.dim %arg1, %c0 : memref<?xindex>
    scf.for %arg5 = %c0 to %dim_0 step %c1 {
      memref.store %false, %arg4[%arg5] : memref<?xi1>
    }
    scf.for %arg5 = %c0 to %dim step %c1 {
      %0 = memref.load %arg0[%arg5] : memref<?xindex>
      %1 = memref.load %arg2[%arg5] : memref<?xi1>
      %2 = scf.for %arg6 = %c0 to %dim_0 step %c1 iter_args(%arg7 = %true) -> (i1) {
        %5 = memref.load %arg1[%arg6] : memref<?xindex>
        %6 = arith.cmpi eq, %5, %0 : index
        scf.if %6 {
          %9 = memref.load %arg4[%arg6] : memref<?xi1>
          %10 = arith.ori %9, %1 : i1
          memref.store %10, %arg4[%arg6] : memref<?xi1>
        }
        %7 = arith.cmpi ne, %5, %0 : index
        %8 = arith.andi %arg7, %7 : i1
        scf.yield %8 : i1
      }
      %3 = scf.for %arg6 = %c0 to %arg5 step %c1 iter_args(%arg7 = %2) -> (i1) {
        %5 = memref.load %arg0[%arg6] : memref<?xindex>
        %6 = arith.cmpi ne, %5, %0 : index
        %7 = arith.andi %arg7, %6 : i1
        scf.yield %7 : i1
      }
      %4 = arith.andi %3, %1 : i1
      memref.store %4, %arg3[%arg5] : memref<?xi1>
    }
    return
  }
}
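For orientation, the new kernel above is a decode-phase matmul (a single 1x8960 activation row times an 8960x1536 weight): each scalar of A is broadcast and fused-multiply-added into a 32-wide accumulator that sweeps one block of output columns, a prefetch runs 1024 rows ahead of the B loads, and the outer scf.parallel loop over column blocks is what -convert-scf-to-openmp later turns into an OpenMP worksharing loop. A rough C sketch of the same access pattern, assuming N is a multiple of the vector width (the function and macro names here are illustrative, not part of the commit):

#include <stddef.h>

#define VEC 32              /* mirrors vector<32xf32> in the kernel */
#define PREFETCH_AHEAD 1024 /* mirrors %prefetch_step */

/* Decode-phase matmul sketch: C[1xN] += A[1xK] * B[KxN], B row-major.
 * Scalar stand-in for vector.broadcast / vector.fma; assumes N % VEC == 0. */
void decode_matmul(const float *A, const float *B, float *C,
                   size_t K, size_t N) {
  for (size_t n = 0; n < N; n += VEC) {   /* outer loop: parallelized in the IR */
    float acc[VEC];
    for (size_t v = 0; v < VEC; ++v)
      acc[v] = C[n + v];                  /* vector.load of the C tile */
    for (size_t k = 0; k < K; ++k) {
#if defined(__GNUC__) || defined(__clang__)
      if (k + PREFETCH_AHEAD < K)         /* guarded here; the MLIR kernel
                                             prefetches unconditionally */
        __builtin_prefetch(&B[(k + PREFETCH_AHEAD) * N + n], 0, 0);
#endif
      float a = A[k];                     /* broadcast A[0][k] */
      for (size_t v = 0; v < VEC; ++v)    /* vector.fma accumulation */
        acc[v] += a * B[k * N + n + v];
    }
    for (size_t v = 0; v < VEC; ++v)
      C[n + v] = acc[v];                  /* vector.store back to C */
  }
}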

examples/BuddyNext/next-linalg-matmul.mlir

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
  // RUN: -bufferization-lower-deallocations \
  // RUN: -convert-bufferization-to-memref \
  // RUN: -assume-tight-memref-layout \
- // RUN: -matmul-parallel-vectorization-optimize \
+ // RUN: -matmul-vectorization-decode \
  // RUN: -batchmatmul-optimize \
  // RUN: -convert-linalg-to-affine-loops \
  // RUN: -affine-loop-fusion \

midend/lib/Conversion/MatMulOptimization/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,7 @@
  add_mlir_library(MatMulOptimization
- MatMulOptimize.cpp
+ MatMulOptimize.cpp
  MatMulVectorization.cpp
+ MatMulVectorizationDecode.cpp
  MatMulParallelVectorization.cpp
  MatMulBlisVectorization.cpp
  BatchMatMulOptimize.cpp
@@ -30,5 +31,5 @@ add_mlir_library(MatMulTransposeBVec
  )

  add_mlir_library(MatMulBlisVectorization
- MatMulBlisVectorization.cpp
+ MatMulBlisVectorization.cpp
  )
