Re-land upstream PR#146667: [flang][OpenMP] Allocate reduction init temps on the stack f or GPUs (llvm#3892)

ronlieb · web-flow · commit 1201af18998b · 2025-09-05T13:14:01.000-04:00
diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -503,31 +503,37 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
 
   // TODO: Allocate on the heap if the whole reduction/privatization is nested
   // inside of a loop
-  mlir::Value tempValue;
-  std::optional<int64_t> cstNeedsDealloc;
-  if (isAllocatableOrPointer) {
-    auto [heapTemp, needsDealloc] = createTempFromMold(loc, builder, source);
-    tempValue = heapTemp;
-    cstNeedsDealloc = fir::getIntIfConstant(needsDealloc);
-  } else {
-    tempValue = hlfir::createStackTempFromMold(loc, builder, source);
-    cstNeedsDealloc = false;
-  }
-  hlfir::Entity temp{tempValue};
-
-  // if needsDealloc isn't statically false, add cleanup region. Always
-  // do this for allocatable boxes because they might have been re-allocated
-  // in the body of the loop/parallel region
-  assert(cstNeedsDealloc.has_value() &&
-         "createTempFromMold decides this statically");
-  if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
-    mlir::OpBuilder::InsertionGuard guard(builder);
-    createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
-                        isDoConcurrent);
-  } else {
-    assert(!isAllocatableOrPointer &&
-           "Pointer-like arrays must be heap allocated");
-  }
+  auto temp = [&]() {
+    bool shouldAllocateOnStack = false;
+
+    // On the GPU, always allocate on the stack since heap allocatins are very
+    // expensive.
+    if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
+            *builder.getModule()))
+      shouldAllocateOnStack = offloadMod.getIsGPU();
+
+    if (shouldAllocateOnStack)
+      return createStackTempFromMold(loc, builder, source);
+
+    auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
+
+    // if needsDealloc isn't statically false, add cleanup region. Always
+    // do this for allocatable boxes because they might have been re-allocated
+    // in the body of the loop/parallel region
+    std::optional<int64_t> cstNeedsDealloc =
+        fir::getIntIfConstant(needsDealloc);
+    assert(cstNeedsDealloc.has_value() &&
+           "createTempFromMold decides this statically");
+    if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
+                          isDoConcurrent);
+    } else {
+      assert(!isAllocatableOrPointer &&
+             "Pointer-like arrays must be heap allocated");
+    }
+    return temp;
+  }();
 
   // Put the temporary inside of a box:
   // hlfir::genVariableBox doesn't handle non-default lower bounds
diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90
@@ -1,4 +1,3 @@
-! XFAIL: *
 ! Tests delayed privatization for `targets ... private(..)` for allocatables.
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-array.f90
@@ -1,5 +1,4 @@
 ! Test delayed privatization for arrays.
-! XFAIL: *
 
 ! RUN: split-file %s %t
 
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
@@ -1,6 +1,5 @@
 ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! XFAIL: *
 
 program reduce
 integer, dimension(2:4, 2) :: i = 0
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
@@ -1,6 +1,8 @@
-! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! XFAIL: *
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 | FileCheck %s --check-prefix=GPU
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir -fopenmp -fopenmp-is-target-device -o - %s 2>&1 | FileCheck %s --check-prefix=GPU
 
 program reduce
 integer, dimension(3) :: i = 0
@@ -14,81 +16,88 @@ program reduce
 print *,i
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
-! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
-! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK-LABEL:   } init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[ALLOC:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           %[[VAL_4:.*]] = arith.constant 3 : index
-! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> {bindc_name = ".tmp", uniq_name = ""}
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<3xi32>>,
-! CHECK:           %[[TRUE:.*]]  = arith.constant true
+! CPU-LABEL:   omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
+! CPU:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CPU:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU-LABEL:   } init {
+! CPU:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[ALLOC:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CPU:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CPU:           %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           %[[VAL_4:.*]] = arith.constant 3 : index
+! CPU:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
+! CPU:           %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> {bindc_name = ".tmp", uniq_name = ""}
+! CPU:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<3xi32>>,
+! CPU:           %[[TRUE:.*]]  = arith.constant true
 !fir.shape<1>) -> (!fir.heap<!fir.array<3xi32>>, !fir.heap<!fir.array<3xi32>>)
-! CHECK:           %[[C0:.*]] = arith.constant 0 : index
-! CHECK:           %[[DIMS:.*]]:3 = fir.box_dims %[[VAL_3]], %[[C0]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
-! CHECK:           %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
-! CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap<!fir.array<3xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<3xi32>>
-! CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
-! CHECK:           fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           omp.yield(%[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK:         } combiner {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
-! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           %[[C1:.*]] = arith.constant 1 : index
-! CHECK:           %[[C3:.*]] = arith.constant 3 : index
-! CHECK:           %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[C1]], %[[C3]] : (index, index) -> !fir.shapeshift<1>
-! CHECK:           %[[C1_0:.*]] = arith.constant 1 : index
-! CHECK:           fir.do_loop %[[VAL_8:.*]] = %[[C1_0]] to %[[C3]] step %[[C1_0]] unordered {
-! CHECK:             %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
-! CHECK:             %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
-! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:             %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
-! CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
-! CHECK:             fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
-! CHECK:           }
-! CHECK:           omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK:         }  cleanup {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
-! CHECK:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<3xi32>>) -> !fir.ref<!fir.array<3xi32>>
-! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> i64
-! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
-! CHECK:           %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64
-! CHECK:           fir.if %[[VAL_5]] {
-! CHECK:             %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> !fir.heap<!fir.array<3xi32>>
-! CHECK:             fir.freemem %[[VAL_6]] : !fir.heap<!fir.array<3xi32>>
-! CHECK:           }
-! CHECK:           omp.yield
-! CHECK:         }
+! CPU:           %[[C0:.*]] = arith.constant 0 : index
+! CPU:           %[[DIMS:.*]]:3 = fir.box_dims %[[VAL_3]], %[[C0]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
+! CPU:           %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
+! CPU:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap<!fir.array<3xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<3xi32>>
+! CPU:           hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
+! CPU:           fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           omp.yield(%[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU:         } combiner {
+! CPU:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CPU:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           %[[C1:.*]] = arith.constant 1 : index
+! CPU:           %[[C3:.*]] = arith.constant 3 : index
+! CPU:           %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[C1]], %[[C3]] : (index, index) -> !fir.shapeshift<1>
+! CPU:           %[[C1_0:.*]] = arith.constant 1 : index
+! CPU:           fir.do_loop %[[VAL_8:.*]] = %[[C1_0]] to %[[C3]] step %[[C1_0]] unordered {
+! CPU:             %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CPU:             %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CPU:             %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CPU:             %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+! CPU:             %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
+! CPU:             fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
+! CPU:           }
+! CPU:           omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU:         }  cleanup {
+! CPU:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CPU:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<3xi32>>) -> !fir.ref<!fir.array<3xi32>>
+! CPU:           %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> i64
+! CPU:           %[[VAL_4:.*]] = arith.constant 0 : i64
+! CPU:           %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64
+! CPU:           fir.if %[[VAL_5]] {
+! CPU:             %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> !fir.heap<!fir.array<3xi32>>
+! CPU:             fir.freemem %[[VAL_6]] : !fir.heap<!fir.array<3xi32>>
+! CPU:           }
+! CPU:           omp.yield
+! CPU:         }
+
+! CPU-LABEL:   func.func @_QQmain()
+! CPU:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
+! CPU:           %[[VAL_1:.*]] = arith.constant 3 : index
+! CPU:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+! CPU:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CPU:           %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! CPU:           %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CPU:           fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           omp.parallel reduction(byref @add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
+! CPU:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CPU:             %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:             %[[VAL_10:.*]] = arith.constant 1 : index
+! CPU:             %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]] (%[[VAL_10]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CPU:             hlfir.assign %[[VAL_8]] to %[[VAL_11]] : i32, !fir.ref<i32>
+! CPU:             %[[VAL_12:.*]] = arith.constant 2 : i32
+! CPU:             %[[VAL_13:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:             %[[VAL_14:.*]] = arith.constant 2 : index
+! CPU:             %[[VAL_15:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_14]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CPU:             hlfir.assign %[[VAL_12]] to %[[VAL_15]] : i32, !fir.ref<i32>
+! CPU:             %[[VAL_16:.*]] = arith.constant 3 : i32
+! CPU:             %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:             %[[VAL_18:.*]] = arith.constant 3 : index
+! CPU:             %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CPU:             hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
+! CPU:             omp.terminator
+! CPU:           }
 
-! CHECK-LABEL:   func.func @_QQmain()
-! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
-! CHECK:           %[[VAL_1:.*]] = arith.constant 3 : index
-! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
-! CHECK:           %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
-! CHECK:           %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           omp.parallel reduction(byref @add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
-! CHECK:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : index
-! CHECK:             %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]] (%[[VAL_10]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK:             hlfir.assign %[[VAL_8]] to %[[VAL_11]] : i32, !fir.ref<i32>
-! CHECK:             %[[VAL_12:.*]] = arith.constant 2 : i32
-! CHECK:             %[[VAL_13:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:             %[[VAL_14:.*]] = arith.constant 2 : index
-! CHECK:             %[[VAL_15:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_14]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK:             hlfir.assign %[[VAL_12]] to %[[VAL_15]] : i32, !fir.ref<i32>
-! CHECK:             %[[VAL_16:.*]] = arith.constant 3 : i32
-! CHECK:             %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:             %[[VAL_18:.*]] = arith.constant 3 : index
-! CHECK:             %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK:             hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
-! CHECK:             omp.terminator
-! CHECK:           }
+! GPU: omp.declare_reduction {{.*}} alloc {
+! GPU: } init {
+! GPU-NOT: fir.allocmem {{.*}} {bindc_name = ".tmp", {{.*}}}
+! GPU:     fir.alloca {{.*}} {bindc_name = ".tmp"}
+! GPU: } combiner {
+! GPU: }
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
@@ -1,6 +1,5 @@
 ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! XFAIL: *
 
 program reduce
 integer, dimension(3) :: i = 0
diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90
@@ -1,6 +1,5 @@
 ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! XFAIL: *
 
 ! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc {
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
@@ -1,6 +1,5 @@
 ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! XFAIL: *
 
 subroutine max_array_reduction(l, r)
   integer :: l(:), r(:)
diff --git a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
@@ -1,6 +1,5 @@
 ! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! XFAIL: *
 
 ! CHECK-LABEL:  omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc {
 !                 [...]
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
@@ -1,6 +1,5 @@
 ! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
-! XFAIL: *
 
 program reduce_assumed_shape
 real(8), dimension(2) :: r
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
@@ -1,6 +1,5 @@
 ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! XFAIL: *
 program reduce
 integer :: i = 0
 integer, dimension(2) :: r = 0
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
@@ -1,6 +1,5 @@
 ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! XFAIL: *
 program reduce
 integer :: i = 0
 integer, dimension(2) :: r = 0
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
@@ -1,6 +1,5 @@
 ! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
-! XFAIL: *
 
 program main
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
@@ -14,9 +14,9 @@ end subroutine local_assoc
 
 ! CHECK: fir.local {type = local} @[[LOCALIZER:.*local_assocEa.*]] : !fir.box<!fir.array<8xf32>> init {
 ! CHECK-NEXT: ^{{.*}}(%{{.*}}: !{{.*}}, %[[LOCAL_ARG:.*]]: !{{.*}}):
-! CHECK-NEXT:   %[[TMP_ALLOC:.*]] = fir.alloca !{{.*}} {bindc_name = ".tmp"}
 ! CHECK-NEXT:   %[[C8:.*]] = arith.constant 8 : index
 ! CHECK-NEXT:   %[[SHAPE:.*]] = fir.shape %[[C8]]
+! CHECK-NEXT:   %[[TMP_ALLOC:.*]] = fir.allocmem !{{.*}} {bindc_name = ".tmp", {{.*}}}
 ! CHECK:        %[[TMP_DECL:.*]]:2 = hlfir.declare %[[TMP_ALLOC]](%[[SHAPE]])
 ! CHECK:        %[[C1:.*]] = arith.constant 1 : index
 ! CHECK-NEXT:   %[[C8:.*]] = arith.constant 8 : index
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-! XFAIL: *`
`2`	`1`	! Tests delayed privatization for `targets ... private(..)` for allocatables.
`3`	`2`
`4`	`3`	`! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`! Test delayed privatization for arrays.`
`2`		`-! XFAIL: *`
`3`	`2`
`4`	`3`	`! RUN: split-file %s %t`
`5`	`4`