From e2ac3619c68851a67668ade6769451cff90df69b Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov
Date: Sun, 4 Aug 2024 22:06:55 +0900
Subject: [PATCH] [flang] Lower omp.workshare to other omp constructs

Change to workshare loop wrapper op
Move single op declaration
Schedule pass properly
Correctly handle nested loop nests to be parallelized by workshare
Leave comments for shouldUseWorkshareLowering
Use copyprivate to scatter values from omp.single
TODO still need to implement copy function
TODO transitive check for usage outside of omp.single not implemented yet
Transitively check for users outside of single op
TODO need to implement copy func
TODO need to hoist allocas outside of single regions
Add tests
Hoist allocas
More tests
Emit body for copy func
Test the tmp storing logic
Clean up trivially dead ops
Only handle single-block regions for now
Fix tests for custom assembly for loop wrapper
Only run the lower workshare pass if OpenMP is enabled
Implement some missing functionality
Fix tests
Fix test
Iterate backwards to find all trivially dead ops
Add explanation comment for createCopyFunc
Update test
Emit a proper error message for CFG in workshare
Clean up tests
Fix todo tests
Fix dst/src in copy function
Use omp.single to handle CFG cases
Fix lower workshare tests
Different warning
Fix bug and add better clarification comments
Fix message
Fix tests
Do not emit empty omp.single's
Fix LowerWorkshare test pipelines
---
 flang/include/flang/Optimizer/OpenMP/Passes.h |   5 +
 .../include/flang/Optimizer/OpenMP/Passes.td  |   5 +
 .../flang/Optimizer/Passes/Pipelines.h        |   3 +-
 flang/include/flang/Tools/CrossToolHelpers.h  |   1 +
 flang/lib/Frontend/FrontendActions.cpp        |  10 +-
 flang/lib/Optimizer/OpenMP/CMakeLists.txt     |   1 +
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 527 ++++++++++++++++++
 flang/lib/Optimizer/Passes/Pipelines.cpp      |   6 +-
 flang/test/Fir/basic-program.fir              |   1 +
 .../OpenMP/lower-workshare-alloca.mlir        |  53 ++
 .../OpenMP/lower-workshare-binding.mlir       |  49 ++
 .../OpenMP/lower-workshare-cleanup.mlir       |  57 ++
 .../OpenMP/lower-workshare-copyprivate.mlir   |  73 +++
 .../lower-workshare-correct-parallelize.mlir  |  25 +
 .../OpenMP/lower-workshare-no-single.mlir     |  19 +
 .../OpenMP/lower-workshare-nowait.mlir        |  23 +
 .../OpenMP/lower-workshare-todo-cfg-dom.mlir  |  26 +
 .../OpenMP/lower-workshare-todo-cfg.mlir      |  23 +
 flang/tools/bbc/bbc.cpp                       |   5 +-
 flang/tools/tco/tco.cpp                       |   1 +
 20 files changed, 908 insertions(+), 5 deletions(-)
 create mode 100644 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-binding.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
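Overview (an illustrative sketch only; see the comment on lowerWorkshare in
LowerWorkshare.cpp for the precise form): sequential parts of an omp.workshare
region are wrapped in omp.single, values needed by all threads are broadcast
via copyprivate temporaries, and each omp.workshare.loop_wrapper becomes an
omp.wsloop. Roughly,

    omp.workshare {
      %a = fir.allocmem
      omp.workshare.loop_wrapper {}
      fir.freemem %a
    }

becomes

    %tmp = fir.alloca
    omp.single copyprivate(%tmp) {
      %a = fir.allocmem
      fir.store %a to %tmp
    }
    %a_reloaded = fir.load %tmp
    omp.wsloop {}
    omp.single nowait {
      fir.freemem %a_reloaded
    }
    omp.barrier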
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h b/flang/include/flang/Optimizer/OpenMP/Passes.h
index 403d79667bf44..feb395f1a12db 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.h
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.h
@@ -25,6 +25,11 @@ namespace flangomp {
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/OpenMP/Passes.h.inc"
 
+/// Implements the logic in the 2.8.3 workshare Construct section of the
+/// OpenMP standard, which determines what statements or constructs shall be
+/// divided into units of work.
+bool shouldUseWorkshareLowering(mlir::Operation *op);
+
 } // namespace flangomp
 
 #endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td
index c070bc22ff20c..37977334c1e9e 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.td
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -50,4 +50,9 @@ def FunctionFilteringPass : Pass<"omp-function-filtering"> {
   ];
 }
 
+// Needs to be scheduled on the module as we create functions in it
+def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> {
+  let summary = "Lower workshare construct";
+}
+
 #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index 3b54ac3883858..55fafc2e6b36f 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -123,7 +123,8 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
 void createHLFIRToFIRPassPipeline(
-    mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel);
+    mlir::PassManager &pm, bool enableOpenMP,
+    llvm::OptimizationLevel optLevel = defaultOptLevel);
 
 /// Create a pass pipeline for handling certain OpenMP transformations needed
 /// prior to FIR lowering.
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index df4b21ada058f..d936b739e5815 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -123,6 +123,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
       false; ///< Set no-signed-zeros-fp-math attribute for functions.
   bool UnsafeFPMath = false; ///< Set unsafe-fp-math attribute for functions.
   bool NSWOnLoopVarInc = false; ///< Add nsw flag to loop variable increments.
+  bool EnableOpenMP = false; ///< Enable OpenMP lowering.
 };
 
 struct OffloadModuleOpts {
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index f2e460fc53a67..8c21fe18e67b4 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -715,7 +715,11 @@ void CodeGenAction::lowerHLFIRToFIR() {
   pm.enableVerifier(/*verifyPasses=*/true);
 
   // Create the pass pipeline
-  fir::createHLFIRToFIRPassPipeline(pm, level);
+  fir::createHLFIRToFIRPassPipeline(
+      pm,
+      ci.getInvocation().getFrontendOpts().features.IsEnabled(
+          Fortran::common::LanguageFeature::OpenMP),
+      level);
   (void)mlir::applyPassManagerCLOptions(pm);
 
   if (!mlir::succeeded(pm.run(*mlirModule))) {
@@ -828,6 +832,10 @@ void CodeGenAction::generateLLVMIR() {
       config.VScaleMax = vsr->second;
   }
 
+  if (ci.getInvocation().getFrontendOpts().features.IsEnabled(
+          Fortran::common::LanguageFeature::OpenMP))
+    config.EnableOpenMP = true;
+
   if (ci.getInvocation().getLoweringOpts().getNSWOnLoopVarInc())
     config.NSWOnLoopVarInc = true;
 
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index 035d0d5ca46c7..b1e0dbf6e707e 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -5,6 +5,7 @@ add_flang_library(FlangOpenMPTransforms
   MapsForPrivatizedSymbols.cpp
   MapInfoFinalization.cpp
   MarkDeclareTarget.cpp
+  LowerWorkshare.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
new file mode 100644
index 0000000000000..225c585a02d91
--- /dev/null
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -0,0 +1,527 @@
+//===- LowerWorkshare.cpp - Lower the omp.workshare construct ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering of omp.workshare to other omp constructs.
+//
+// This pass is tasked with parallelizing the loops nested in
+// workshare.loop_wrapper, while the Fortran-to-MLIR lowering and the
+// HLFIR-to-FIR lowering pipelines are responsible for emitting the
+// workshare.loop_wrapper ops where appropriate, according to the
+// `shouldUseWorkshareLowering` function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/Dialect/OpenMP/OpenMPClauseOperands.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/BreadthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+
+#include <variant>
+
+namespace flangomp {
+#define GEN_PASS_DEF_LOWERWORKSHARE
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+#define DEBUG_TYPE "lower-workshare"
+
+using namespace mlir;
+
+namespace flangomp {
+
+// Checks for the nesting pattern below, as we need to avoid sharing the work
+// of statements which are nested in some constructs such as omp.critical or
+// another omp.parallel.
+//
+// omp.workshare { // `wsOp`
+//   ...
+//     omp.T { // `parent`
+//       ...
+//         `op`
+//
+template <typename T>
+static bool isNestedIn(omp::WorkshareOp wsOp, Operation *op) {
+  T parent = op->getParentOfType<T>();
+  if (!parent)
+    return false;
+  return wsOp->isProperAncestor(parent);
+}
+
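+// As a rough illustration (hypothetical Fortran input, not part of this
+// patch), in
+//
+//   !$omp workshare
+//   a = b + c   ! each element assignment is a unit of work
+//   !$omp critical
+//   x = x + 1   ! the whole critical construct is a single unit of work
+//   !$omp end critical
+//   !$omp end workshare
+//
+// the array assignment may be divided among the threads of the team, whereas
+// anything nested in omp.critical, omp.parallel or omp.single must not be.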
+bool shouldUseWorkshareLowering(Operation *op) {
+  auto parentWorkshare = op->getParentOfType<omp::WorkshareOp>();
+
+  if (!parentWorkshare)
+    return false;
+
+  if (isNestedIn<omp::CriticalOp>(parentWorkshare, op))
+    return false;
+
+  // 2.8.3 workshare Construct
+  // For a parallel construct, the construct is a unit of work with respect to
+  // the workshare construct. The statements contained in the parallel
+  // construct are executed by a new thread team.
+  if (isNestedIn<omp::ParallelOp>(parentWorkshare, op))
+    return false;
+
+  // 2.8.2 single Construct
+  // Binding: The binding thread set for a single region is the current team.
+  // A single region binds to the innermost enclosing parallel region.
+  // Description: Only one of the encountering threads will execute the
+  // structured block associated with the single construct.
+  if (isNestedIn<omp::SingleOp>(parentWorkshare, op))
+    return false;
+
+  // Do not use workshare lowering until we support CFG in omp.workshare
+  if (parentWorkshare.getRegion().getBlocks().size() != 1)
+    return false;
+
+  return true;
+}
+
+} // namespace flangomp
+
+namespace {
+
+struct SingleRegion {
+  Block::iterator begin, end;
+};
+
+static bool mustParallelizeOp(Operation *op) {
+  return op
+      ->walk([&](Operation *nested) {
+        // We need to be careful not to pick up workshare.loop_wrapper in
+        // nested omp.parallel{omp.workshare} regions, i.e. make sure that
+        // `nested` binds to the workshare region we are currently handling.
+        //
+        // For example:
+        //
+        // omp.parallel {
+        //   omp.workshare { // currently handling this
+        //     omp.parallel {
+        //       omp.workshare { // nested workshare
+        //         omp.workshare.loop_wrapper {}
+        //
+        // Therefore, we skip if we encounter a nested omp.workshare.
+        if (isa<omp::WorkshareOp>(nested))
+          return WalkResult::skip();
+        if (isa<omp::WorkshareLoopWrapperOp>(nested))
+          return WalkResult::interrupt();
+        return WalkResult::advance();
+      })
+      .wasInterrupted();
+}
+
+static bool isSafeToParallelize(Operation *op) {
+  return isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
+         isMemoryEffectFree(op);
+}
+
+/// Simple shallow copies suffice for our purposes in this pass, so we
+/// implement this simpler alternative to the full-fledged `createCopyFunc` in
+/// the frontend.
+static mlir::func::FuncOp createCopyFunc(mlir::Location loc,
+                                         mlir::Type varType,
+                                         fir::FirOpBuilder builder) {
+  mlir::ModuleOp module = builder.getModule();
+  auto rt = cast<fir::ReferenceType>(varType);
+  mlir::Type eleTy = rt.getEleTy();
+  std::string copyFuncName =
+      fir::getTypeAsString(eleTy, builder.getKindMap(), "_workshare_copy");
+
+  if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
+    return decl;
+  // create function
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  mlir::OpBuilder modBuilder(module.getBodyRegion());
+  llvm::SmallVector<mlir::Type> argsTy = {varType, varType};
+  auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {});
+  mlir::func::FuncOp funcOp =
+      modBuilder.create<mlir::func::FuncOp>(loc, copyFuncName, funcType);
+  funcOp.setVisibility(mlir::SymbolTable::Visibility::Private);
+  builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy,
+                      {loc, loc});
+  builder.setInsertionPointToStart(&funcOp.getRegion().back());
+
+  Value loaded = builder.create<fir::LoadOp>(loc, funcOp.getArgument(1));
+  builder.create<fir::StoreOp>(loc, loaded, funcOp.getArgument(0));
+
+  builder.create<mlir::func::ReturnOp>(loc);
+  return funcOp;
+}
+
+static bool isUserOutsideSR(Operation *user, Operation *parentOp,
+                            SingleRegion sr) {
+  while (user->getParentOp() != parentOp)
+    user = user->getParentOp();
+  return sr.begin->getBlock() != user->getBlock() ||
+         !(user->isBeforeInBlock(&*sr.end) && sr.begin->isBeforeInBlock(user));
+}
+
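+// For intuition (illustrative IR, not taken from this patch): given a single
+// region `sr` containing
+//
+//   %t = "test.producer"() : () -> i32   // not safe to parallelize
+//   %u = arith.addi %t, %t : i32         // pure, cloned everywhere
+//
+// a use of %u after `sr` makes %t transitively used outside `sr`, so %t must
+// be stored to a temporary and broadcast out of the omp.single.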
+static bool isTransitivelyUsedOutside(Value v, SingleRegion sr) {
+  Block *srBlock = sr.begin->getBlock();
+  Operation *parentOp = srBlock->getParentOp();
+
+  for (auto &use : v.getUses()) {
+    Operation *user = use.getOwner();
+    if (isUserOutsideSR(user, parentOp, sr))
+      return true;
+
+    // Now we know user is inside `sr`.
+
+    // Results of nested users cannot be used outside of `sr`.
+    if (user->getBlock() != srBlock)
+      continue;
+
+    // A non-safe-to-parallelize operation will be checked for uses outside
+    // separately.
+    if (!isSafeToParallelize(user))
+      continue;
+
+    // For safe-to-parallelize operations, we need to check if there is a
+    // transitive use of `v` through them.
+    for (auto res : user->getResults())
+      if (isTransitivelyUsedOutside(res, sr))
+        return true;
+  }
+  return false;
+}
+
+/// We clone pure operations in both the parallel and single blocks. This
+/// function cleans them up if they end up with no uses.
+static void cleanupBlock(Block *block) {
+  for (Operation &op : llvm::make_early_inc_range(
+           llvm::make_range(block->rbegin(), block->rend())))
+    if (isOpTriviallyDead(&op))
+      op.erase();
+}
+
+static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
+                              IRMapping &rootMapping, Location loc,
+                              mlir::DominanceInfo &di) {
+  OpBuilder rootBuilder(sourceRegion.getContext());
+  ModuleOp m = sourceRegion.getParentOfType<ModuleOp>();
+  OpBuilder copyFuncBuilder(m.getBodyRegion());
+  fir::FirOpBuilder firCopyFuncBuilder(copyFuncBuilder, m);
+
+  auto mapReloadedValue =
+      [&](Value v, OpBuilder allocaBuilder, OpBuilder singleBuilder,
+          OpBuilder parallelBuilder, IRMapping singleMapping) -> Value {
+    if (auto reloaded = rootMapping.lookupOrNull(v))
+      return nullptr;
+    Type ty = v.getType();
+    Value alloc = allocaBuilder.create<fir::AllocaOp>(loc, ty);
+    singleBuilder.create<fir::StoreOp>(loc, singleMapping.lookup(v), alloc);
+    Value reloaded = parallelBuilder.create<fir::LoadOp>(loc, ty, alloc);
+    rootMapping.map(v, reloaded);
+    return alloc;
+  };
+
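+  // The lambda below distributes the operations of a sequential region into
+  // three staging blocks provided by the caller:
+  //   allocaBlock:   hoisted fir.alloca ops and copyprivate temporaries,
+  //   singleBlock:   ops executed by one thread inside the omp.single,
+  //   parallelBlock: safe-to-parallelize ops re-executed by every thread.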
+  auto moveToSingle =
+      [&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder,
+          OpBuilder parallelBuilder) -> std::pair<bool, SmallVector<Value>> {
+    IRMapping singleMapping = rootMapping;
+    SmallVector<Value> copyPrivate;
+    bool allParallelized = true;
+
+    for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
+      if (isSafeToParallelize(&op)) {
+        singleBuilder.clone(op, singleMapping);
+        if (llvm::all_of(op.getOperands(), [&](Value opr) {
+              // Either we have already remapped it
+              bool remapped = rootMapping.contains(opr);
+              // Or it is available because it dominates `sr`
+              bool dominates =
+                  di.properlyDominates(opr.getDefiningOp(), &*sr.begin);
+              return remapped || dominates;
+            })) {
+          // Safe-to-parallelize operations which have all operands available
+          // in the root parallel block can be executed there.
+          parallelBuilder.clone(op, rootMapping);
+        } else {
+          // If any operand was not available, it means that there was no
+          // transitive use of a non-safe-to-parallelize operation outside
+          // `sr`. This means that there should be no transitive uses outside
+          // `sr` of `op`.
+          assert(llvm::all_of(op.getResults(), [&](Value v) {
+            return !isTransitivelyUsedOutside(v, sr);
+          }));
+          allParallelized = false;
+        }
+      } else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
+        auto hoisted =
+            cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
+        rootMapping.map(&*alloca, &*hoisted);
+        rootMapping.map(alloca.getResult(), hoisted.getResult());
+        copyPrivate.push_back(hoisted);
+        allParallelized = false;
+      } else {
+        singleBuilder.clone(op, singleMapping);
+        // Prepare reloaded values for results of operations that cannot be
+        // safely parallelized and which are used after the region `sr`.
+        for (auto res : op.getResults()) {
+          if (isTransitivelyUsedOutside(res, sr)) {
+            auto alloc = mapReloadedValue(res, allocaBuilder, singleBuilder,
+                                          parallelBuilder, singleMapping);
+            if (alloc)
+              copyPrivate.push_back(alloc);
+          }
+        }
+        allParallelized = false;
+      }
+    }
+    singleBuilder.create<omp::TerminatorOp>(loc);
+    return {allParallelized, copyPrivate};
+  };
+
+  for (Block &block : sourceRegion) {
+    Block *targetBlock = rootBuilder.createBlock(
+        &targetRegion, {}, block.getArgumentTypes(),
+        llvm::map_to_vector(block.getArguments(),
+                            [](BlockArgument arg) { return arg.getLoc(); }));
+    rootMapping.map(&block, targetBlock);
+    rootMapping.map(block.getArguments(), targetBlock->getArguments());
+  }
+
+  auto handleOneBlock = [&](Block &block) {
+    Block &targetBlock = *rootMapping.lookup(&block);
+    rootBuilder.setInsertionPointToStart(&targetBlock);
+    Operation *terminator = block.getTerminator();
+    SmallVector<std::variant<SingleRegion, Operation *>> regions;
+
+    auto it = block.begin();
+    auto getOneRegion = [&]() {
+      if (&*it == terminator)
+        return false;
+      if (mustParallelizeOp(&*it)) {
+        regions.push_back(&*it);
+        it++;
+        return true;
+      }
+      SingleRegion sr;
+      sr.begin = it;
+      while (&*it != terminator && !mustParallelizeOp(&*it))
+        it++;
+      sr.end = it;
+      assert(sr.begin != sr.end);
+      regions.push_back(sr);
+      return true;
+    };
+    while (getOneRegion())
+      ;
+
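+    // For example (illustrative, not from a test in this patch), a block
+    //
+    //   op1; op2; omp.workshare.loop_wrapper {...}; op3; terminator
+    //
+    // is partitioned into
+    //
+    //   [SingleRegion{op1, op2}, loop_wrapper, SingleRegion{op3}]
+    //
+    // below, alternating sequential chunks with ops that must be parallelized.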
+    for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
+      bool isLast = i + 1 == regions.size();
+      if (std::holds_alternative<SingleRegion>(opOrSingle)) {
+        OpBuilder singleBuilder(sourceRegion.getContext());
+        Block *singleBlock = new Block();
+        singleBuilder.setInsertionPointToStart(singleBlock);
+
+        OpBuilder allocaBuilder(sourceRegion.getContext());
+        Block *allocaBlock = new Block();
+        allocaBuilder.setInsertionPointToStart(allocaBlock);
+
+        OpBuilder parallelBuilder(sourceRegion.getContext());
+        Block *parallelBlock = new Block();
+        parallelBuilder.setInsertionPointToStart(parallelBlock);
+
+        auto [allParallelized, copyprivateVars] =
+            moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder,
+                         singleBuilder, parallelBuilder);
+        if (allParallelized) {
+          // The single region was not required as all operations were safe to
+          // parallelize
+          assert(copyprivateVars.empty());
+          assert(allocaBlock->empty());
+          delete singleBlock;
+        } else {
+          omp::SingleOperands singleOperands;
+          if (isLast)
+            singleOperands.nowait = rootBuilder.getUnitAttr();
+          singleOperands.copyprivateVars = copyprivateVars;
+          cleanupBlock(singleBlock);
+          for (auto var : singleOperands.copyprivateVars) {
+            mlir::func::FuncOp funcOp =
+                createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
+            singleOperands.copyprivateSyms.push_back(
+                SymbolRefAttr::get(funcOp));
+          }
+          omp::SingleOp singleOp =
+              rootBuilder.create<omp::SingleOp>(loc, singleOperands);
+          singleOp.getRegion().push_back(singleBlock);
+          targetRegion.front().getOperations().splice(
+              singleOp->getIterator(), allocaBlock->getOperations());
+        }
+        rootBuilder.getInsertionBlock()->getOperations().splice(
+            rootBuilder.getInsertionPoint(), parallelBlock->getOperations());
+        delete allocaBlock;
+        delete parallelBlock;
+      } else {
+        auto op = std::get<Operation *>(opOrSingle);
+        if (auto wslw = dyn_cast<omp::WorkshareLoopWrapperOp>(op)) {
+          omp::WsloopOperands wsloopOperands;
+          if (isLast)
+            wsloopOperands.nowait = rootBuilder.getUnitAttr();
+          auto wsloop =
+              rootBuilder.create<mlir::omp::WsloopOp>(loc, wsloopOperands);
+          auto clonedWslw = cast<omp::WorkshareLoopWrapperOp>(
+              rootBuilder.clone(*wslw, rootMapping));
+          wsloop.getRegion().takeBody(clonedWslw.getRegion());
+          clonedWslw->erase();
+        } else {
+          assert(mustParallelizeOp(op));
+          Operation *cloned =
+              rootBuilder.cloneWithoutRegions(*op, rootMapping);
+          for (auto [region, clonedRegion] :
+               llvm::zip(op->getRegions(), cloned->getRegions()))
+            parallelizeRegion(region, clonedRegion, rootMapping, loc, di);
+        }
+      }
+    }
+
+    rootBuilder.clone(*block.getTerminator(), rootMapping);
+  };
+
+  if (sourceRegion.hasOneBlock()) {
+    handleOneBlock(sourceRegion.front());
+  } else {
+    auto &domTree = di.getDomTree(&sourceRegion);
+    for (auto node : llvm::breadth_first(domTree.getRootNode())) {
+      handleOneBlock(*node->getBlock());
+    }
+  }
+
+  for (Block &targetBlock : targetRegion)
+    cleanupBlock(&targetBlock);
+}
+
+/// Lowers workshare to a sequence of single-thread regions and parallel loops
+///
+/// For example:
+///
+/// omp.workshare {
+///   %a = fir.allocmem
+///   omp.workshare.loop_wrapper {}
+///   fir.call Assign %b %a
+///   fir.freemem %a
+/// }
+///
+/// becomes
+///
+/// %tmp = fir.alloca
+/// omp.single copyprivate(%tmp) {
+///   %a = fir.allocmem
+///   fir.store %a %tmp
+/// }
+/// %a_reloaded = fir.load %tmp
+/// omp.workshare.loop_wrapper {}
+/// omp.single {
+///   fir.call Assign %b %a_reloaded
+///   fir.freemem %a_reloaded
+/// }
+///
+/// Note that we allocate temporary memory for values in omp.single's which
+/// need to be accessed by all threads and broadcast them using single's
+/// copyprivate
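+///
+/// The copyprivate clauses emitted by this lowering reference copy functions
+/// generated by createCopyFunc above; e.g. for an i32 temporary the helper is
+/// equivalent to:
+///
+///   func.func private @_workshare_copy_i32(%dst : !fir.ref<i32>,
+///                                          %src : !fir.ref<i32>) {
+///     %0 = fir.load %src : !fir.ref<i32>
+///     fir.store %0 to %dst : !fir.ref<i32>
+///     return
+///   }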
+LogicalResult lowerWorkshare(mlir::omp::WorkshareOp wsOp, DominanceInfo &di) {
+  Location loc = wsOp->getLoc();
+  IRMapping rootMapping;
+
+  OpBuilder rootBuilder(wsOp);
+
+  // FIXME Currently, we only support workshare constructs with structured
+  // control flow. The transformation itself supports CFG, however, once we
+  // transform the MLIR region in the omp.workshare, we need to inline that
+  // region in the parent block. We have no guarantees at this point of the
+  // pipeline that the parent op supports CFG (e.g. fir.if), thus this is not
+  // generally possible. The alternative is to put the lowered region in an
+  // operation akin to scf.execute_region, which will get lowered at the same
+  // time when fir ops get lowered to CFG. However, SCF is not registered in
+  // flang so we cannot use it. Remove this requirement once we have
+  // scf.execute_region or an alternative operation available.
+  if (wsOp.getRegion().getBlocks().size() == 1) {
+    // This operation is just a placeholder which will be erased later. We
+    // need it because our `parallelizeRegion` function works on regions and
+    // not blocks.
+    omp::WorkshareOp newOp =
+        rootBuilder.create<omp::WorkshareOp>(loc, omp::WorkshareOperands());
+    if (!wsOp.getNowait())
+      rootBuilder.create<omp::BarrierOp>(loc);
+
+    parallelizeRegion(wsOp.getRegion(), newOp.getRegion(), rootMapping, loc,
+                      di);
+
+    // Inline the contents of the placeholder workshare op into its parent
+    // block.
+    Block *theBlock = &newOp.getRegion().front();
+    Operation *term = theBlock->getTerminator();
+    Block *parentBlock = wsOp->getBlock();
+    parentBlock->getOperations().splice(newOp->getIterator(),
+                                        theBlock->getOperations());
+    assert(term->getNumOperands() == 0);
+    term->erase();
+    newOp->erase();
+    wsOp->erase();
+  } else {
+    // Otherwise just change the operation to an omp.single.
+
+    wsOp->emitWarning(
+        "omp workshare with unstructured control flow is currently "
+        "unsupported and will be serialized.");
+
+    // `shouldUseWorkshareLowering` should have guaranteed that there are no
+    // omp.workshare_loop_wrapper's that bind to this omp.workshare.
+    assert(!wsOp->walk([&](Operation *op) {
+                // Nested omp.workshare can have their own
+                // omp.workshare_loop_wrapper's.
+                if (isa<omp::WorkshareOp>(op))
+                  return WalkResult::skip();
+                if (isa<omp::WorkshareLoopWrapperOp>(op))
+                  return WalkResult::interrupt();
+                return WalkResult::advance();
+              })
+              .wasInterrupted());
+
+    omp::SingleOperands operands;
+    operands.nowait = wsOp.getNowaitAttr();
+    omp::SingleOp newOp = rootBuilder.create<omp::SingleOp>(loc, operands);
+
+    newOp.getRegion().getBlocks().splice(newOp.getRegion().getBlocks().begin(),
+                                         wsOp.getRegion().getBlocks());
+    wsOp->erase();
+  }
+  return success();
+}
+
+class LowerWorksharePass
+    : public flangomp::impl::LowerWorkshareBase<LowerWorksharePass> {
+public:
+  void runOnOperation() override {
+    mlir::DominanceInfo &di = getAnalysis<mlir::DominanceInfo>();
+    getOperation()->walk([&](mlir::omp::WorkshareOp wsOp) {
+      if (failed(lowerWorkshare(wsOp, di)))
+        signalPassFailure();
+    });
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index a914407991591..31af3531641dd 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -212,7 +212,7 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
 /// \param pm - MLIR pass manager that will hold the pipeline definition
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
-void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
+void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
                                   llvm::OptimizationLevel optLevel) {
   if (optLevel.isOptimizingForSpeed()) {
     addCanonicalizerPassWithoutRegionSimplification(pm);
@@ -230,6 +230,8 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
   pm.addPass(hlfir::createBufferizeHLFIR());
   pm.addPass(hlfir::createConvertHLFIRtoFIR());
+  if (enableOpenMP)
+    pm.addPass(flangomp::createLowerWorkshare());
 }
 
 /// Create a pass pipeline for handling certain OpenMP transformations needed
@@ -303,7 +305,7 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
 void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
                                   MLIRToLLVMPassPipelineConfig &config,
                                   llvm::StringRef inputFilename) {
-  fir::createHLFIRToFIRPassPipeline(pm, config.OptLevel);
+  fir::createHLFIRToFIRPassPipeline(pm, config.EnableOpenMP, config.OptLevel);
 
   // Add default optimizer pass pipeline.
   fir::createDefaultFIROptimizerPassPipeline(pm, config);
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index bca454c13ff9c..4b18acb7c2b43 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -47,6 +47,7 @@ func.func @_QQmain() {
 // PASSES-NEXT: LowerHLFIRIntrinsics
 // PASSES-NEXT: BufferizeHLFIR
 // PASSES-NEXT: ConvertHLFIRtoFIR
+// PASSES-NEXT: LowerWorkshare
 // PASSES-NEXT: CSE
 // PASSES-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 // PASSES-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir b/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
new file mode 100644
index 0000000000000..12b0558d06ed5
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
@@ -0,0 +1,53 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Checks that fir.alloca is hoisted out and copyprivate'd
+func.func @wsfunc() {
+  omp.workshare {
+    %c1 = arith.constant 1 : index
+    %c42 = arith.constant 42 : index
+    %c1_i32 = arith.constant 1 : i32
+    %alloc = fir.alloca i32
+    fir.store %c1_i32 to %alloc : !fir.ref<i32>
+    omp.workshare.loop_wrapper {
+      omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+        "test.test1"(%alloc) : (!fir.ref<i32>) -> ()
+        omp.yield
+      }
+    }
+    "test.test2"(%alloc) : (!fir.ref<i32>) -> ()
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func.func private @_workshare_copy_i32(
+// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32>,
+// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32>) {
+// CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+// CHECK: fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+
+// CHECK-LABEL: func.func @wsfunc() {
+// CHECK: %[[VAL_0:.*]] = fir.alloca i32
+// CHECK: omp.single copyprivate(%[[VAL_0]] -> @_workshare_copy_i32 : !fir.ref<i32>) {
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK: fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref<i32>
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 42 : index
+// CHECK: omp.wsloop {
+// CHECK: omp.loop_nest (%[[VAL_4:.*]]) : index = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_2]]) {
+// CHECK: "test.test1"(%[[VAL_0]]) : (!fir.ref<i32>) -> ()
+// CHECK: omp.yield
+// CHECK: }
+// CHECK: }
+// CHECK: omp.single nowait {
+// CHECK: "test.test2"(%[[VAL_0]]) : (!fir.ref<i32>) -> ()
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.barrier
+// CHECK: return
+// CHECK: }
+
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir b/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir
new file mode 100644
index 0000000000000..f1d0e8e229614
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir
@@ -0,0 +1,49 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Checks that the omp.workshare.loop_wrapper binds to the correct omp.workshare
+
+func.func @wsfunc() {
+  %c1 = arith.constant 1 : index
+  %c42 = arith.constant 42 : index
+  omp.parallel {
+    omp.workshare nowait {
+      omp.parallel {
+        omp.workshare nowait {
+          omp.workshare.loop_wrapper {
+            omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+              "test.test2"() : () -> ()
+              omp.yield
+            }
+          }
+          omp.terminator
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @wsfunc() {
+// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 42 : index
+// CHECK: omp.parallel {
+// CHECK: omp.single nowait {
+// CHECK: omp.parallel {
+// CHECK: omp.wsloop nowait {
+// CHECK: omp.loop_nest (%[[VAL_2:.*]]) : index = (%[[VAL_0]]) to (%[[VAL_1]]) inclusive step (%[[VAL_0]]) {
+// CHECK: "test.test2"() : () -> ()
+// CHECK: omp.yield
+// CHECK: }
+// CHECK: }
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: return
+// CHECK: }
+
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir b/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir
new file mode 100644
index 0000000000000..ca288917a3ac4
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir
@@ -0,0 +1,57 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we clean up unused pure operations from the parallel and single
+// regions
+
+// CHECK-LABEL: func.func @wsfunc() {
+// CHECK: %[[VAL_0:.*]] = fir.alloca i32
+// CHECK: omp.parallel {
+// CHECK: omp.single {
+// CHECK: %[[VAL_1:.*]] = "test.test1"() : () -> i32
+// CHECK: %[[VAL_2:.*]] = arith.constant 2 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 3 : index
+// CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_2]], %[[VAL_3]] : index
+// CHECK: "test.test3"(%[[VAL_4]]) : (index) -> ()
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant 42 : index
+// CHECK: omp.wsloop nowait {
+// CHECK: omp.loop_nest (%[[VAL_7:.*]]) : index = (%[[VAL_5]]) to (%[[VAL_6]]) inclusive step (%[[VAL_5]]) {
+// CHECK: "test.test2"() : () -> ()
+// CHECK: omp.yield
+// CHECK: }
+// CHECK: }
+// CHECK: omp.barrier
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: return
+// CHECK: }
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+      %t1 = "test.test1"() : () -> i32
+
+      %c1 = arith.constant 1 : index
+      %c42 = arith.constant 42 : index
+
+      %c2 = arith.constant 2 : index
+      %c3 = arith.constant 3 : index
+      %add = arith.addi %c2, %c3 : index
+      "test.test3"(%add) : (index) -> ()
+
+      omp.workshare.loop_wrapper {
+        omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+          "test.test2"() : () -> ()
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir b/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir
new file mode 100644
index 0000000000000..d7a04e198ceed
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir
@@ -0,0 +1,73 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we store the correct values
+
+func.func @wsfunc() {
+  omp.parallel {
+    // CHECK: fir.alloca
+    // CHECK: fir.alloca
+    // CHECK: fir.alloca
+    // CHECK: fir.alloca
+    // CHECK: fir.alloca
+    // CHECK-NOT: fir.alloca
+    omp.workshare {
+
+      %t1 = "test.test1"() : () -> i32
+      // CHECK: %[[T1:.*]] = "test.test1"
+      // CHECK: fir.store %[[T1]]
+      %t2 = "test.test2"() : () -> i32
+      // CHECK: %[[T2:.*]] = "test.test2"
+      // CHECK: fir.store %[[T2]]
+      %t3 = "test.test3"() : () -> i32
+      // CHECK: %[[T3:.*]] = "test.test3"
+      // CHECK-NOT: fir.store %[[T3]]
+      %t4 = "test.test4"() : () -> i32
+      // CHECK: %[[T4:.*]] = "test.test4"
+      // CHECK: fir.store %[[T4]]
+      %t5 = "test.test5"() : () -> i32
+      // CHECK: %[[T5:.*]] = "test.test5"
+      // CHECK: fir.store %[[T5]]
+      %t6 = "test.test6"() : () -> i32
+      // CHECK: %[[T6:.*]] = "test.test6"
+      // CHECK-NOT: fir.store %[[T6]]
+
+      "test.test1"(%t1) : (i32) -> ()
+      "test.test1"(%t2) : (i32) -> ()
+      "test.test1"(%t3) : (i32) -> ()
+
+      %true = arith.constant true
+      fir.if %true {
+        "test.test2"(%t3) : (i32) -> ()
+      }
+
+      %c1_i32 = arith.constant 1 : i32
+
+      %t5_pure_use = arith.addi %t5, %c1_i32 : i32
+
+      %t6_mem_effect_use = "test.test8"(%t6) : (i32) -> i32
+      // CHECK: %[[T6_USE:.*]] = "test.test8"
+      // CHECK: fir.store %[[T6_USE]]
+
+      %c42 = arith.constant 42 : index
+      %c1 = arith.constant 1 : index
+      omp.workshare.loop_wrapper {
+        omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+          "test.test10"(%t1) : (i32) -> ()
+          "test.test10"(%t5_pure_use) : (i32) -> ()
+          "test.test10"(%t6_mem_effect_use) : (i32) -> ()
+          omp.yield
+        }
+      }
+
+      "test.test10"(%t2) : (i32) -> ()
+      fir.if %true {
+        "test.test10"(%t4) : (i32) -> ()
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir b/flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir
new file mode 100644
index 0000000000000..31db8213b5f00
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir
@@ -0,0 +1,25 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that the safe-to-parallelize `fir.declare` op will not be parallelized
+// due to its operand %alloc not being reloaded outside the omp.single.
+
+func.func @foo() {
+  %c0 = arith.constant 0 : index
+  omp.workshare {
+    %alloc = fir.allocmem !fir.array<?xi32>, %c0 {bindc_name = ".tmp.forall", uniq_name = ""}
+    %shape = fir.shape %c0 : (index) -> !fir.shape<1>
+    %declare = fir.declare %alloc(%shape) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xi32>>
+    fir.freemem %alloc : !fir.heap<!fir.array<?xi32>>
+    omp.terminator
+  }
+  return
+}
+
+// CHECK: omp.single nowait
+// CHECK: fir.allocmem
+// CHECK: fir.shape
+// CHECK: fir.declare
+// CHECK: fir.freemem
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.barrier
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir b/flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir
new file mode 100644
index 0000000000000..1fd379a6e5eb4
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir
@@ -0,0 +1,19 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we do not emit an omp.single for the constant operation
+
+func.func @foo() {
+  omp.workshare {
+    %c1 = arith.constant 1 : index
+    omp.workshare.loop_wrapper {
+      omp.loop_nest (%arg1) : index = (%c1) to (%c1) inclusive step (%c1) {
+        "test.test0"() : () -> ()
+        omp.yield
+      }
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-NOT: omp.single
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir b/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir
new file mode 100644
index 0000000000000..940662e0bdccc
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir
@@ -0,0 +1,23 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we correctly handle nowait
+
+// CHECK-LABEL: func.func @nonowait
+func.func @nonowait(%arg0: !fir.ref<!fir.array<42xi32>>) {
+  // CHECK: omp.barrier
+  omp.workshare {
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @nowait
+func.func @nowait(%arg0: !fir.ref<!fir.array<42xi32>>) {
+  // CHECK-NOT: omp.barrier
+  omp.workshare nowait {
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
new file mode 100644
index 0000000000000..83c49cd635d08
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
@@ -0,0 +1,26 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | FileCheck %s
+
+// CHECK: warning: omp workshare with unstructured control flow is currently unsupported and will be serialized.
+
+// CHECK: omp.parallel
+// CHECK-NEXT: omp.single
+
+// TODO Check that the definition of %r dominates its use post-transform
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+    ^bb1:
+      %c1 = arith.constant 1 : i32
+      cf.br ^bb3(%c1: i32)
+    ^bb2:
+      "test.test2"(%r) : (i32) -> ()
+      omp.terminator
+    ^bb3(%arg1: i32):
+      %r = "test.test2"(%arg1) : (i32) -> i32
+      cf.br ^bb2
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
new file mode 100644
index 0000000000000..a27cf88069401
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
@@ -0,0 +1,23 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | FileCheck %s
+
+// CHECK: warning: omp workshare with unstructured control flow is currently unsupported and will be serialized.
+
+// CHECK: omp.parallel
+// CHECK-NEXT: omp.single
+
+// TODO Check transforming a simple CFG
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+    ^bb1:
+      %c1 = arith.constant 1 : i32
+      cf.br ^bb3(%c1: i32)
+    ^bb3(%arg1: i32):
+      "test.test2"(%arg1) : (i32) -> ()
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index fe5e36f704c76..1c24979bbcdaf 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -452,7 +452,8 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
 
   if (emitFIR && useHLFIR) {
     // lower HLFIR to FIR
-    fir::createHLFIRToFIRPassPipeline(pm, llvm::OptimizationLevel::O2);
+    fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP,
+                                      llvm::OptimizationLevel::O2);
     if (mlir::failed(pm.run(mlirModule))) {
       llvm::errs() << "FATAL: lowering from HLFIR to FIR failed";
       return mlir::failure();
@@ -467,6 +468,8 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
 
     // Add O2 optimizer pass pipeline.
     MLIRToLLVMPassPipelineConfig config(llvm::OptimizationLevel::O2);
+    if (enableOpenMP)
+      config.EnableOpenMP = true;
     config.NSWOnLoopVarInc = setNSW;
     fir::registerDefaultInlinerPass(config);
     fir::createDefaultFIROptimizerPassPipeline(pm, config);
diff --git a/flang/tools/tco/tco.cpp b/flang/tools/tco/tco.cpp
index 5c373c4e85258..eaf4bae088454 100644
--- a/flang/tools/tco/tco.cpp
+++ b/flang/tools/tco/tco.cpp
@@ -139,6 +139,7 @@ compileFIR(const mlir::PassPipelineCLParser &passPipeline) {
     return mlir::failure();
   } else {
     MLIRToLLVMPassPipelineConfig config(llvm::OptimizationLevel::O2);
+    config.EnableOpenMP = true; // assume the input contains OpenMP
     config.AliasAnalysis = true; // enabled when optimizing for speed
     if (codeGenLLVM) {
       // Run only CodeGen passes.