MLIR 23.0.0git
ACCRoutineToGPUFunc.cpp
Go to the documentation of this file.
1//===- ACCRoutineToGPUFunc.cpp - Move ACC routines to GPU module ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The OpenACC `routine` directive defines functions that may be invoked from
10// device code. Those functions need to be available in the device compilation
11// unit. This pass moves materialized acc routines into the GPU module as
12// gpu.func operations so they can be compiled for the device.
13//
14// Overview:
15// ---------
16// For each acc.routine that is not bound by name, the corresponding
17// specialized function (created by ACCRoutineLowering) or the original
18// host function (in case of seq) is cloned into the GPU module as a gpu.func.
19// Callees referenced from those routines are processed: device-valid callees
20// (runtime, intrinsics, other acc routines) are added to the GPU module as
21// declarations or full clones as needed. Bind-name routines are not moved;
22// their acc.routine ops are erased. After cloning, the host copies of
23// specialized device functions and nohost routines are removed.
24//
25// Approach:
26// ----------------
27// 1. Collect materialized routines (acc.routine without bind(name)); record
28// bind-name routines for erasure. Emit remarks for materialized routines.
29//
30// 2. Process calls: walk each materialized function; for each call, if the
31// callee is already in the GPU module or is an acc routine (or specialized
32// acc routine), skip; otherwise require OpenACCSupport::isValidSymbolUse.
33// Valid callees are added to the clone set (as declaration or full clone).
34//
35// 3. Clone into GPU module: each function in the clone set is turned into a
36// gpu.func (body cloned or declaration only). acc.specialized_routine is
37// preserved and symbol uses are updated so the routine name is unchanged.
38//
39// 4. Cleanup: erase from the host module the specialized device function
40// bodies and any nohost routine (host copy removed after move to device).
41//
42// Example:
43// --------
44// Before (after ACCRoutineLowering):
45// acc.routine @r_seq func(@foo) seq
46// func.func @foo() attributes {acc.specialized_routine = ...} { ... }
47//
48// After:
49// acc.routine @r_seq func(@foo) seq
50// gpu.module @acc_gpu_module {
51// gpu.func @foo() attributes {acc.specialized_routine = ...} { ... }
52// }
53// (host @foo erased)
54//
55// Requirements:
56// -------------
57// - Must run after `ACCRoutineLowering` pass which ensures variants for all
58// levels of parallelism are created.
59// - Uses OpenACCSupport: getOrCreateGPUModule, isValidSymbolUse, emitRemark,
60// emitNYI. If no custom implementation is registered, the default is used.
61//
62//===----------------------------------------------------------------------===//
63
65
71#include "mlir/IR/IRMapping.h"
72#include "mlir/IR/SymbolTable.h"
74#include "llvm/ADT/SetVector.h"
75#include <string>
76
77namespace mlir {
78namespace acc {
79#define GEN_PASS_DEF_ACCROUTINETOGPUFUNC
80#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
81} // namespace acc
82} // namespace mlir
83
84#define DEBUG_TYPE "acc-routine-to-gpu-func"
85
86using namespace mlir;
87using namespace mlir::acc;
88
89namespace {
90
91/// Create a gpu.func from a func.func by cloning the body.
92static gpu::GPUFuncOp createGPUFuncFromFunc(OpBuilder &builder,
93 func::FuncOp sourceFunc) {
94 Location loc = sourceFunc.getLoc();
95 StringRef name = sourceFunc.getName();
96 FunctionType type = sourceFunc.getFunctionType();
97 // Do not copy any attributes from the source; specialized_routine is set
98 // later when applicable.
99 gpu::GPUFuncOp gpuFunc =
100 gpu::GPUFuncOp::create(builder, loc, name, type,
101 /*workgroupAttributions=*/TypeRange(),
102 /*privateAttributions=*/TypeRange(), /*attrs=*/{});
103
104 Region &sourceBody = sourceFunc.getBody();
105 Region &deviceBody = gpuFunc.getBody();
106 Block &deviceEntryBlock = deviceBody.front();
107
108 // Map source block arguments to the GPU func's entry block arguments (which
109 // GPUFuncOp::create already created).
110 IRMapping mapping;
111 Block &sourceEntryBlock = sourceBody.front();
112 for (auto [srcArg, destArg] : llvm::zip(sourceEntryBlock.getArguments(),
113 deviceEntryBlock.getArguments()))
114 mapping.map(srcArg, destArg);
115
116 sourceBody.cloneInto(&deviceBody, mapping);
117
118 // Replace func.return with gpu.return in the cloned blocks.
119 gpuFunc.walk([](func::ReturnOp op) {
120 OpBuilder replacer(op);
121 gpu::ReturnOp gpuReturn = gpu::ReturnOp::create(replacer, op.getLoc());
122 gpuReturn->setOperands(op.getOperands());
123 op.erase();
124 });
125
126 // Splice the cloned entry block's operations into the GPU func's entry block
127 // (cloneInto created a separate block for the cloned content), then remove
128 // the now-empty cloned block.
129 Block *clonedSourceEntry = mapping.lookup(&sourceEntryBlock);
130 deviceEntryBlock.getOperations().splice(
131 deviceEntryBlock.getOperations().end(),
132 clonedSourceEntry->getOperations());
133 clonedSourceEntry->erase();
134
135 return gpuFunc;
136}
137
/// A host function scheduled for cloning into the GPU module, paired with the
/// acc.routine it was collected for. The routine is null (`RoutineOp{}`) for
/// callees that were added only because a routine body calls them.
138using CloneCandidate = std::pair<func::FuncOp, RoutineOp>;
139
140/// Collect materialized and bind routines; fill candidate func names and
141/// materialized routine set. Emit remarks for materialized routines.
142static void collectRoutineCandidates(
143 ModuleOp mod, SymbolTable &symTab, acc::DeviceType deviceType,
144 OpenACCSupport &accSupport,
145 llvm::SmallSetVector<llvm::StringRef, 4> &funcsToCloneCandidates,
146 llvm::SmallSetVector<RoutineOp, 4> &materializedAccRoutines,
147 llvm::SmallSetVector<RoutineOp, 4> &bindAccRoutines) {
148 auto isParallelRoutine = [deviceType](RoutineOp routineOp) {
149 return routineOp.hasGang(deviceType) || routineOp.hasGang() ||
150 routineOp.hasWorker(deviceType) || routineOp.hasWorker() ||
151 routineOp.hasVector(deviceType) || routineOp.hasVector() ||
152 routineOp.getGangDimValue(deviceType) || routineOp.getGangDimValue();
153 };
154
155 mod.walk([&](RoutineOp op) {
156 if (op.getBindNameValue() || op.getBindNameValue(deviceType)) {
157 bindAccRoutines.insert(op);
158 return;
159 }
160 func::FuncOp callee =
161 symTab.lookup<func::FuncOp>(op.getFuncName().getLeafReference());
162 accSupport.emitRemark(
163 callee ? callee.getOperation() : op.getOperation(),
164 [&op, &isParallelRoutine]() {
165 std::string msg = "Generating";
166 if (op.getImplicitAttr())
167 msg += " implicit";
168 msg += " acc routine";
169 if (!isParallelRoutine(op))
170 msg += " seq";
171 return msg;
172 },
173 DEBUG_TYPE);
174 funcsToCloneCandidates.insert(op.getFuncName().getLeafReference());
175 materializedAccRoutines.insert(op);
176 });
177}
178
179/// Process calls in ACC routines: add valid callees to funcsToClone (for
180/// declaration or clone). Returns failure() if any call is unsupported.
181static LogicalResult processCallsInRoutines(
182 SymbolTable &symTab, SymbolTable &gpuSymTab, OpenACCSupport &accSupport,
183 const llvm::SmallSetVector<llvm::StringRef, 4> &funcsToCloneCandidates,
184 const llvm::SmallSetVector<RoutineOp, 4> &materializedAccRoutines,
185 llvm::SmallSetVector<CloneCandidate, 4> &funcsToClone) {
186 LogicalResult callCheckResult = success();
187 auto processCalls = [&](CallOpInterface callOp) {
188 if (!callOp.getCallableForCallee())
189 return;
190 auto calleeSymbolRef =
191 dyn_cast<SymbolRefAttr>(callOp.getCallableForCallee());
192 if (!calleeSymbolRef)
193 return;
194
195 auto callee =
196 symTab.lookup<func::FuncOp>(calleeSymbolRef.getLeafReference());
197 if (!callee)
198 return;
199
200 if (gpuSymTab.lookup(callee.getName()))
201 return;
202 if (isAccRoutine(callee) || isSpecializedAccRoutine(callee))
203 return;
204
205 if (!accSupport.isValidSymbolUse(callOp.getOperation(), calleeSymbolRef)) {
206 accSupport.emitNYI(callOp->getLoc(), "Unsupported call in acc routine");
207 callCheckResult = failure();
208 return;
209 }
210 funcsToClone.insert({callee, RoutineOp{}});
211 };
212
213 for (auto [funcName, accRoutine] :
214 llvm::zip(funcsToCloneCandidates, materializedAccRoutines)) {
215 func::FuncOp func = symTab.lookup<func::FuncOp>(funcName);
216 if (!func)
217 continue;
218 if (!gpuSymTab.lookup(funcName))
219 funcsToClone.insert({func, accRoutine});
220 func.walk([&](CallOpInterface callOp) { processCalls(callOp); });
221 if (failed(callCheckResult))
222 return failure();
223 }
224 return success();
225}
226
227/// Clone each function in funcsToClone into the GPU module (declaration or
228/// full body). Fix up symbol names and specialized_routine attr for ACC
229/// routines.
230static LogicalResult cloneFuncsToGPUModule(
231 ModuleOp mod, OpenACCSupport &accSupport, SymbolTable &gpuSymTab,
232 const llvm::SmallSetVector<CloneCandidate, 4> &funcsToClone) {
233 MLIRContext *ctx = mod.getContext();
234 OpBuilder builder(ctx);
235
236 for (CloneCandidate candidate : funcsToClone) {
237 func::FuncOp srcFunc = candidate.first;
238
239 if (srcFunc.isDeclaration()) {
240 Operation *cloned = srcFunc->clone();
241 gpuSymTab.insert(cloned);
242 continue;
243 }
244
245 gpu::GPUFuncOp deviceFuncOp = createGPUFuncFromFunc(builder, srcFunc);
246
247 if (auto specRoutineAttr = srcFunc->getAttrOfType<SpecializedRoutineAttr>(
249 StringAttr funcName = specRoutineAttr.getFuncName();
251 StringAttr::get(ctx, deviceFuncOp.getName()), funcName, mod))) {
252 accSupport.emitNYI(deviceFuncOp.getLoc(),
253 "cannot replace symbol for acc routine");
254 return failure();
255 }
256 deviceFuncOp->setAttr(SymbolTable::getSymbolAttrName(), funcName);
257 }
258 if (auto specAttr = srcFunc->getAttrOfType<SpecializedRoutineAttr>(
260 deviceFuncOp->setAttr(getSpecializedRoutineAttrName(), specAttr);
261
262 gpuSymTab.insert(deviceFuncOp);
263 }
264 return success();
265}
266
267/// Remove specialized device copies and nohost routines from the host module.
268static void
269cleanupHostModule(const llvm::SmallSetVector<CloneCandidate, 4> &funcsToClone) {
270 for (CloneCandidate candidate : funcsToClone) {
271 func::FuncOp funcCandidate = candidate.first;
272 RoutineOp routineCandidate = candidate.second;
273 if ((routineCandidate && routineCandidate.getNohost()) ||
274 acc::isSpecializedAccRoutine(funcCandidate))
275 funcCandidate.erase();
276 }
277}
278
279class ACCRoutineToGPUFunc
280 : public acc::impl::ACCRoutineToGPUFuncBase<ACCRoutineToGPUFunc> {
281public:
282 using acc::impl::ACCRoutineToGPUFuncBase<
283 ACCRoutineToGPUFunc>::ACCRoutineToGPUFuncBase;
284
285 void runOnOperation() override {
286 ModuleOp mod = getOperation();
287 if (mod.getOps<RoutineOp>().empty()) {
288 LLVM_DEBUG(llvm::dbgs()
289 << "Skipping ACCRoutineToGPUFunc - no acc.routine ops\n");
290 return;
291 }
292
293 OpenACCSupport &accSupport = getAnalysis<OpenACCSupport>();
294 std::optional<gpu::GPUModuleOp> gpuModOpt =
295 accSupport.getOrCreateGPUModule(mod);
296 if (!gpuModOpt) {
297 accSupport.emitNYI(mod.getLoc(), "Failed to create GPU module");
298 return signalPassFailure();
299 }
300 gpu::GPUModuleOp gpuMod = *gpuModOpt;
301
302 SymbolTable symTab(mod);
303 SymbolTable gpuSymTab(gpuMod);
304
305 llvm::SmallSetVector<llvm::StringRef, 4> funcsToCloneCandidates;
306 llvm::SmallSetVector<RoutineOp, 4> materializedAccRoutines;
307 llvm::SmallSetVector<RoutineOp, 4> bindAccRoutines;
308
309 collectRoutineCandidates(mod, symTab, this->deviceType, accSupport,
310 funcsToCloneCandidates, materializedAccRoutines,
311 bindAccRoutines);
312
313 llvm::SmallSetVector<CloneCandidate, 4> funcsToClone;
314 if (failed(processCallsInRoutines(symTab, gpuSymTab, accSupport,
315 funcsToCloneCandidates,
316 materializedAccRoutines, funcsToClone)))
317 return signalPassFailure();
318
319 if (failed(cloneFuncsToGPUModule(mod, accSupport, gpuSymTab, funcsToClone)))
320 return signalPassFailure();
321
322 cleanupHostModule(funcsToClone);
323 for (RoutineOp bindOp : bindAccRoutines)
324 bindOp.erase();
325 }
326};
327
328} // namespace
return success()
#define DEBUG_TYPE
Block represents an ordered list of Operations.
Definition Block.h:33
void erase()
Unlink this Block from its parent region and delete it.
Definition Block.cpp:66
OpListType & getOperations()
Definition Block.h:147
Operation & front()
Definition Block.h:163
BlockArgListType getArguments()
Definition Block.h:97
This is a utility class for mapping one set of IR entities to another.
Definition IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition IRMapping.h:30
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class helps build Operations.
Definition Builders.h:209
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
Operation * clone(IRMapping &mapper, const CloneOptions &options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition Region.h:26
Block & front()
Definition Region.h:65
This class allows for representing and managing the symbol table used by operations with the 'SymbolT...
Definition SymbolTable.h:24
static StringRef getSymbolAttrName()
Return the name of the attribute used for symbol names.
Definition SymbolTable.h:76
static LogicalResult replaceAllSymbolUses(StringAttr oldSymbol, StringAttr newSymbol, Operation *from)
Attempt to replace all uses of the given symbol 'oldSymbol' with the provided symbol 'newSymbol' that...
Operation * lookup(StringRef name) const
Look up a symbol with the specified name, returning null if no such name exists.
StringAttr insert(Operation *symbol, Block::iterator insertPt={})
Insert a new symbol into the table, and rename it as necessary to avoid collisions.
remark::detail::InFlightRemark emitRemark(Operation *op, std::function< std::string()> messageFn, llvm::StringRef category="openacc")
Emit an OpenACC remark with lazy message generation.
InFlightDiagnostic emitNYI(Location loc, const Twine &message)
Report a case that is not yet supported by the implementation.
bool isValidSymbolUse(Operation *user, SymbolRefAttr symbol, Operation **definingOpPtr=nullptr)
Check if a symbol use is valid for use in an OpenACC region.
std::optional< gpu::GPUModuleOp > getOrCreateGPUModule(ModuleOp mod, bool create=true, llvm::StringRef name="")
Get or optionally create a GPU module in the given module.
bool isAccRoutine(mlir::Operation *op)
Used to check whether the current operation is marked with acc routine.
Definition OpenACC.h:192
static constexpr StringLiteral getSpecializedRoutineAttrName()
Definition OpenACC.h:186
bool isSpecializedAccRoutine(mlir::Operation *op)
Used to check whether this is a specialized accelerator version of acc routine function.
Definition OpenACC.h:198
Include the generated interface declarations.