Commit 404b8ab

added non-unit stride support for input gradient
1 parent 04b4eed commit 404b8ab

2 files changed: +112, -15 lines

lib/Conversion/TorchToLinalg/Linear.cpp

Lines changed: 54 additions & 15 deletions
@@ -1826,36 +1826,75 @@ class ConvertAtenConvolutionBackwardOp : public OpConversionPattern<AtenConvolutionBackwardOp>
     torch_to_linalg::flipTensor(rewriter, loc, weightExpanded, kernelFlipDims);
 
     // For backward-input, padding must be adjusted to:
-    //   pad_bwd[i] = dilation[i] * (kernel_size[i] - 1) - pad_fwd[i]
-    Value cstOne = arith::ConstantOp::create(rewriter, loc, rewriter.getI64IntegerAttr(1));
+    //   p'[i] = d[i] * (K[i] - 1) - p[i]
+    Value c1 = arith::ConstantOp::create(rewriter, loc, rewriter.getI64IntegerAttr(1));
     Value padVal = arith::ConstantOp::create(
         rewriter, loc, rewriter.getFloatAttr(gradOutputDTy, 0.0));
-    SmallVector<Value> gradOutputPaddingValues(numSpatialDims);
     SmallVector<Value> dilationIntValues =
         getAsConstantIntValues(rewriter, loc, dilationInts);
+    SmallVector<Value> weiSizes = getTensorSizes(rewriter, loc, weightExpanded);
+    SmallVector<Value> gradOutputPaddingValues(numSpatialDims);
     for (size_t i = 0; i < numSpatialDims; ++i) {
-      Value kSize = castIndexToInt64(rewriter, loc, getDimOp(rewriter, loc, weightExpanded, spatialStartDimIdx + i));
+      Value kSize = castIndexToInt64(rewriter, loc, weiSizes[spatialStartDimIdx + i]);
       Value kMinusOne = rewriter.createOrFold<arith::SubIOp>(
-          loc, kSize, cstOne);
+          loc, kSize, c1);
       Value dilated = rewriter.createOrFold<arith::MulIOp>(
-          loc, kMinusOne, castIntToIndex(rewriter, loc, dilationIntValues[i]));
+          loc, kMinusOne, dilationIntValues[i]);
       gradOutputPaddingValues[i] = arith::SubIOp::create(rewriter, loc, dilated, paddingIntValues[i]);
+
+      if (isValueNegative(gradOutputPaddingValues[i]))
+        return rewriter.notifyMatchFailure(
+            op, "unimplemented: negative padding values are not supported.");
     }
 
-    bool do_insert_slice = llvm::any_of(strideInts, [](int64_t stride) { return stride > 1; });
-    if (do_insert_slice) {
-      return rewriter.notifyMatchFailure(
-          op, "unimplemented: do_insert_slice");
+    // If there are non-unit strides, we have to scatter `grad_output` into a zero-initialized tensor.
+    SmallVector<Value> gradInputSizes = getTensorSizes(rewriter, loc, input);
+    Value gradOutputSliced;
+    if (llvm::any_of(strideInts, [](int64_t stride) { return stride > 1; })) {
+      // Destination spatial sizes are computed as:
+      //   size[i] = (D[i] - 1) + d[i] * (K[i] - 1) + 1
+      // Offsets on spatial dims are the adjusted paddings.
+      // Strides on spatial dims are the original stride[i].
+      Value zero = arith::ConstantOp::create(rewriter, loc, rewriter.getIndexAttr(0));
+      Value one = arith::ConstantOp::create(rewriter, loc, rewriter.getIndexAttr(1));
+
+      // Initialize slice strides, sizes, and offsets.
+      SmallVector<Value> goSizes = getTensorSizes(rewriter, loc, gradOutputExpanded);
+      SmallVector<Value> sizes(goSizes.begin(), goSizes.begin() + spatialStartDimIdx);
+      SmallVector<Value> offsets(spatialStartDimIdx, zero);
+      SmallVector<Value> strides(spatialStartDimIdx, one);
+      for (size_t i = 0; i < numSpatialDims; ++i) {
+        // The shape of `grad_input` is collapsed here.
+        Value h = gradInputSizes[2 + i];
+        Value k = weiSizes[spatialStartDimIdx + i];
+        Value hMinusOne = rewriter.createOrFold<arith::SubIOp>(loc, h, one);
+        Value kMinusOne = rewriter.createOrFold<arith::SubIOp>(loc, k, one);
+        Value mul = rewriter.createOrFold<arith::MulIOp>(
+            loc, castIntToIndex(rewriter, loc, dilationIntValues[i]), kMinusOne);
+        Value sum = rewriter.createOrFold<arith::AddIOp>(loc, hMinusOne, mul);
+        sizes.push_back(rewriter.createOrFold<arith::AddIOp>(loc, sum, one));
+        offsets.push_back(castIntToIndex(rewriter, loc, gradOutputPaddingValues[i]));
+
+        Value strideIntValue = arith::ConstantOp::create(
+            rewriter, loc, rewriter.getI64IntegerAttr(strideInts[i]));
+        strides.push_back(castIntToIndex(rewriter, loc, strideIntValue));
+      }
+
+      Value zeroInit =
+          createZeroInitTensor(rewriter, loc, sizes, gradOutputDTy);
+      gradOutputSliced = tensor::InsertSliceOp::create(
+          rewriter, loc, torch_to_linalg::removeSizeInformation(rewriter, loc, gradOutputExpanded),
+          zeroInit, offsets, goSizes, strides);
     } else {
-      // Pad `grad_output` spatial dims with zeros. If grouped, input has shape:
-      // N x G x F/G x <spatial>. Otherwise: N x F x <spatial>.
-      gradOutputExpanded = torch_to_linalg::getDynamicZeroPaddedTensor(
+      // For unit strides, pad `grad_output` spatial dims with zeros.
+      // If the conv is grouped, the output has shape:
+      //   N x G x F/G x <spatial>. Otherwise: N x F x <spatial>.
+      gradOutputSliced = torch_to_linalg::getDynamicZeroPaddedTensor(
           op, rewriter, gradOutputExpanded, gradOutputPaddingValues, spatialStartDimIdx, padVal);
     }
 
     // Initialize output buffer. For grouped, compute into an expanded
     // [N, G, C/G, D*] tensor and collapse back to the original input shape.
-    SmallVector<Value> gradInputSizes = getTensorSizes(rewriter, loc, input);
     Value gradInputInit =
         createZeroInitTensor(rewriter, loc, gradInputSizes, inputDTy);
     SmallVector<ReassociationIndices> gradInputCollapseIndices;

@@ -1975,7 +2014,7 @@ class ConvertAtenConvolutionBackwardOp : public OpConversionPattern<AtenConvolutionBackwardOp>
 
     auto genericRes = linalg::GenericOp::create(
         rewriter, loc, gradInputInit.getType(),
-        ValueRange{gradOutputExpanded, weightExpanded},
+        ValueRange{gradOutputSliced, weightExpanded},
         gradInputInit, indexingMaps, iteratorTypes,
         [&](OpBuilder &b, Location loc, ValueRange args) {
           Value grad = args[0];
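
For a quick sanity check of the two formulas in this hunk, here is a minimal standalone C++ sketch (not part of the pattern; the constants mirror one spatial dimension of the test added below: input 64, kernel 2, and stride/padding/dilation all 2):

#include <cassert>

int main() {
  const int D = 64, K = 2, s = 2, p = 2, d = 2; // one spatial dim of the test

  // Adjusted backward-input padding: p'[i] = d[i] * (K[i] - 1) - p[i].
  const int pAdj = d * (K - 1) - p;
  assert(pAdj == 0); // matches the [0, 0, 0, 0] offsets of tensor.insert_slice

  // Scatter destination size: size[i] = (D[i] - 1) + d[i] * (K[i] - 1) + 1.
  const int destSize = (D - 1) + d * (K - 1) + 1;
  assert(destSize == 66); // matches the tensor<2x16x66x66xf32> destination

  // Forward output size; grad_output is written back with step s.
  const int Dout = (D + 2 * p - d * (K - 1) - 1) / s + 1;
  assert(Dout == 33);
  assert(pAdj + (Dout - 1) * s < destSize); // scattered writes stay in bounds
  return 0;
}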

test/Conversion/TorchToLinalg/convolution_bwd.mlir

Lines changed: 58 additions & 0 deletions
@@ -59,6 +59,64 @@ func.func @convolution_backward_input_1x1s_0x0p_1x1d_1g(%arg0: !torch.vtensor<[2
 
 // -----
 
+// CHECK-LABEL: func.func @convolution_backward_input_2x2s_2x2p_2x2d_1g(
+// CHECK-SAME: %[[VAL_0:.*]]: !torch.vtensor<[2,16,33,33],f32>, %[[VAL_1:.*]]: !torch.vtensor<[2,128,64,64],f32>,
+// CHECK-SAME: %[[VAL_2:.*]]: !torch.vtensor<[16,128,2,2],f32>,
+// CHECK-SAME: %[[VAL_3:.*]]: !torch.vtensor<[],f32>) -> (!torch.vtensor<[2,128,64,64],f32>, !torch.vtensor<[16],f32>) {
+func.func @convolution_backward_input_2x2s_2x2p_2x2d_1g(%arg0: !torch.vtensor<[2,16,33,33],f32>, %arg1: !torch.vtensor<[2,128,64,64],f32>, %arg2: !torch.vtensor<[16,128,2,2],f32>, %arg3: !torch.vtensor<[],f32>) -> (!torch.vtensor<[2,128,64,64],f32>, !torch.vtensor<[16],f32>) {
+  // CHECK: %[[CST1:.*]] = arith.constant 1 : index
+  // CHECK: %[[CST0:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: %[[T1:.*]] = torch_c.to_builtin_tensor %[[VAL_2]] : !torch.vtensor<[16,128,2,2],f32> -> tensor<16x128x2x2xf32>
+  // CHECK: %[[T0:.*]] = torch_c.to_builtin_tensor %[[VAL_0]] : !torch.vtensor<[2,16,33,33],f32> -> tensor<2x16x33x33xf32>
+  // CHECK: %[[W_EMPTY:.*]] = tensor.empty() : tensor<16x128x2x2xf32>
+  // CHECK: %[[W_FILLED:.*]] = linalg.fill ins(%[[CST0]] : f32) outs(%[[W_EMPTY]] : tensor<16x128x2x2xf32>) -> tensor<16x128x2x2xf32>
+  // CHECK: %[[W_REV:.*]] = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[T1]] : tensor<16x128x2x2xf32>) outs(%[[W_FILLED]] : tensor<16x128x2x2xf32>) {
+  // CHECK-NEXT: ^bb0(%[[IN_W:.*]]: f32, %[[OUT_W:.*]]: f32):
+  // CHECK-NEXT: %[[I0:.*]] = linalg.index 0 : index
+  // CHECK-NEXT: %[[I1:.*]] = linalg.index 1 : index
+  // CHECK-NEXT: %[[I2:.*]] = linalg.index 2 : index
+  // CHECK-NEXT: %[[I3:.*]] = linalg.index 3 : index
+  // CHECK-NEXT: %[[R2:.*]] = arith.subi %[[CST1]], %[[I2]] : index
+  // CHECK-NEXT: %[[R3:.*]] = arith.subi %[[CST1]], %[[I3]] : index
+  // CHECK-NEXT: %[[EX:.*]] = tensor.extract %[[T1]][%[[I0]], %[[I1]], %[[R2]], %[[R3]]] : tensor<16x128x2x2xf32>
+  // CHECK-NEXT: linalg.yield %[[EX]] : f32
+  // CHECK-NEXT: } -> tensor<16x128x2x2xf32>
+  // CHECK: %[[SLICE_EMPTY:.*]] = tensor.empty() : tensor<2x16x66x66xf32>
+  // CHECK-NEXT: %[[SLICE_FILLED:.*]] = linalg.fill ins(%cst : f32) outs(%[[SLICE_EMPTY]] : tensor<2x16x66x66xf32>) -> tensor<2x16x66x66xf32>
+  // CHECK-NEXT: %[[SLICE:.*]] = tensor.insert_slice %[[T0]] into %[[SLICE_FILLED]][0, 0, 0, 0] [2, 16, 33, 33] [1, 1, 2, 2] : tensor<2x16x33x33xf32> into tensor<2x16x66x66xf32>
+  // CHECK: %[[OUT_EMPTY:.*]] = tensor.empty() : tensor<2x128x64x64xf32>
+  // CHECK: %[[OUT_FILLED:.*]] = linalg.fill ins(%[[CST0]] : f32) outs(%[[OUT_EMPTY]] : tensor<2x128x64x64xf32>) -> tensor<2x128x64x64xf32>
+  // CHECK: %[[CONV:.*]] = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d5 * 2 + d2, d6 * 2 + d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%[[SLICE]], %[[W_REV]] : tensor<2x16x66x66xf32>, tensor<16x128x2x2xf32>) outs(%[[OUT_FILLED]] : tensor<2x128x64x64xf32>) {
+  // CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[IN1:.*]]: f32, %[[OUT:.*]]: f32):
+  // CHECK-NEXT: %[[MUL:.*]] = arith.mulf %[[IN]], %[[IN1]] : f32
+  // CHECK-NEXT: %[[ACC:.*]] = arith.addf %[[MUL]], %[[OUT]] : f32
+  // CHECK-NEXT: linalg.yield %[[ACC]] : f32
+  // CHECK-NEXT: } -> tensor<2x128x64x64xf32>
+  // CHECK: %[[IGRAD:.*]] = torch_c.from_builtin_tensor %[[CONV]] : tensor<2x128x64x64xf32> -> !torch.vtensor<[2,128,64,64],f32>
+  // CHECK: %[[SUM_EMPTY:.*]] = tensor.empty() : tensor<16xf32>
+  // CHECK: %[[SUM_FILLED:.*]] = linalg.fill ins(%[[CST0]] : f32) outs(%[[SUM_EMPTY]] : tensor<16xf32>) -> tensor<16xf32>
+  // CHECK: %[[SUM_GEN:.*]] = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1)>], iterator_types = ["reduction", "parallel", "reduction", "reduction"]} ins(%[[T0]] : tensor<2x16x33x33xf32>) outs(%[[SUM_FILLED]] : tensor<16xf32>) {
+  // CHECK-NEXT: ^bb0(%[[IN_B:.*]]: f32, %[[ACC_B:.*]]: f32):
+  // CHECK-NEXT: %[[B_RES:.*]] = arith.addf %[[IN_B]], %[[ACC_B]] : f32
+  // CHECK-NEXT: linalg.yield %[[B_RES]] : f32
+  // CHECK-NEXT: } -> tensor<16xf32>
+  // CHECK: %[[BIAS:.*]] = torch_c.from_builtin_tensor %[[SUM_GEN]] : tensor<16xf32> -> !torch.vtensor<[16],f32>
+  // CHECK: return %[[IGRAD]], %[[BIAS]] : !torch.vtensor<[2,128,64,64],f32>, !torch.vtensor<[16],f32>
+  %true = torch.constant.bool true
+  %int0 = torch.constant.int 0
+  %false = torch.constant.bool false
+  %int1 = torch.constant.int 1
+  %int2 = torch.constant.int 2
+  %0 = torch.prim.ListConstruct %int1 : (!torch.int) -> !torch.list<int>
+  %1 = torch.prim.ListConstruct %int2, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
+  %2 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %3 = torch.prim.ListConstruct %true, %false, %true : (!torch.bool, !torch.bool, !torch.bool) -> !torch.list<bool>
+  %result0, %result1, %result2 = torch.aten.convolution_backward %arg0, %arg1, %arg2, %0, %1, %1, %1, %false, %2, %int1, %3 : !torch.vtensor<[2,16,33,33],f32>, !torch.vtensor<[2,128,64,64],f32>, !torch.vtensor<[16,128,2,2],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int, !torch.list<bool> -> !torch.vtensor<[2,128,64,64],f32>, !torch.none, !torch.vtensor<[16],f32>
+  return %result0, %result2 : !torch.vtensor<[2,128,64,64],f32>, !torch.vtensor<[16],f32>
+}
+
+// -----
+
 // CHECK-LABEL: func.func @convolution_backward_weights_1x1s_0x0p_1x1d_1g(
 // CHECK-SAME: %[[VAL_0:.*]]: !torch.vtensor<[2,16,63,63],f32>, %[[VAL_1:.*]]: !torch.vtensor<[2,128,64,64],f32>,
 // CHECK-SAME: %[[VAL_2:.*]]: !torch.vtensor<[16,128,2,2],f32>,
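
Why scattering `grad_output` with the forward stride and then convolving at stride 1 reproduces the input gradient can be checked with a small 1-D reference model. This is a hypothetical standalone sketch, not code from this commit; the variable names and fill values are arbitrary, and the parameters mirror one spatial dimension of the test above:

#include <cassert>
#include <vector>

int main() {
  const int D = 64, K = 2, s = 2, p = 2, d = 2;
  const int Dout = (D + 2 * p - d * (K - 1) - 1) / s + 1; // 33, as in the test

  std::vector<double> w(K), gradOut(Dout);
  for (int k = 0; k < K; ++k) w[k] = 0.5 + k;
  for (int o = 0; o < Dout; ++o) gradOut[o] = 1.0 + 0.1 * o;

  // Reference: accumulate gradOut[o] * w[k] into every x[i] the forward
  // convolution read, i.e. i = o*s + k*d - p.
  std::vector<double> direct(D, 0.0);
  for (int o = 0; o < Dout; ++o)
    for (int k = 0; k < K; ++k) {
      const int i = o * s + k * d - p;
      if (i >= 0 && i < D)
        direct[i] += gradOut[o] * w[k];
    }

  // Scheme from this commit: scatter grad_output with step s at offset
  // p' = d*(K-1) - p into a zero tensor of size (D-1) + d*(K-1) + 1 ...
  const int pAdj = d * (K - 1) - p;
  assert(pAdj >= 0); // the pattern bails out on negative adjusted padding
  std::vector<double> scattered((D - 1) + d * (K - 1) + 1, 0.0);
  for (int o = 0; o < Dout; ++o)
    scattered[pAdj + o * s] = gradOut[o];

  // ... then run a stride-1, dilation-d convolution with the flipped kernel.
  std::vector<double> viaScatter(D, 0.0);
  for (int i = 0; i < D; ++i)
    for (int k = 0; k < K; ++k)
      viaScatter[i] += scattered[i + k * d] * w[K - 1 - k];

  for (int i = 0; i < D; ++i)
    assert(direct[i] == viaScatter[i]);
  return 0;
}

The inner index `scattered[i + k * d]` is the scalar analogue of the `d5 * 2 + d2` terms in the CONV indexing map above, and the scatter step `s` corresponds to the `[1, 1, 2, 2]` strides of the `tensor.insert_slice`.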
