MLIR 22.0.0git
ACCSpecializeForHost.cpp
Go to the documentation of this file.
1//===- ACCSpecializeForHost.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass converts OpenACC operations to host-compatible representations,
10// enabling execution on the host rather than on accelerator devices.
11//
12// Overview:
13// ---------
14// The pass operates in two modes depending on the `enableHostFallback` option:
15//
16// 1. Default Mode (Orphan Operations Only):
17// Only converts "orphan" ACC operations that are not inside or attached to
18// compute regions. This is used for host routines (acc routine marked for
19// host) where structured/unstructured data constructs, compute constructs,
20// and their associated data operations should be preserved.
21//
22// 2. Host Fallback Mode (enableHostFallback=true):
23// Converts ALL ACC operations within the region to host equivalents. This
24// is used when the `if` clause evaluates to false at runtime and the
25// entire ACC region needs to fall back to host execution.
26//
27// Transformations (Default Mode, orphan operations only):
28// -------------------------------------------------------
29// The following orphan operations are converted:
30//
31// 1. Atomic Ops (converted to load/store):
32// acc.atomic.update -> load + compute + store
33// acc.atomic.read -> load + store (copy)
34// acc.atomic.write -> store
35// acc.atomic.capture -> inline region contents
36//
37// 2. Loop Ops (converted to SCF):
38// acc.loop (structured) -> scf.for
39// acc.loop (unstructured) -> scf.execute_region
40//
41// 3. Orphan Data Entry Ops (replaced with var operand):
42// acc.cache, acc.private, acc.firstprivate, acc.reduction
43// (only if NOT connected to compute constructs or loop)
44//
45// Transformations (Host Fallback Mode):
46// -------------------------------------
47// In addition to orphan transformations, ALL of the following are converted:
48//
49// 1. Data Entry Ops (replaced with var operand):
50// acc.copyin, acc.create, acc.attach, acc.present, acc.deviceptr,
51// acc.get_deviceptr, acc.nocreate, acc.declare_device_resident,
52// acc.declare_link, acc.use_device, acc.update_device
53//
54// 2. Data Exit Ops (erased):
55// acc.copyout, acc.delete, acc.detach, acc.update_host
56//
57// 3. Structured Data/Compute Constructs (region inlined):
58// acc.data, acc.host_data, acc.kernel_environment, acc.declare,
59// acc.parallel, acc.serial, acc.kernels
60//
61// 4. Unstructured Data Ops (erased):
62// acc.enter_data, acc.exit_data, acc.update
63//
64// 5. Declare Ops (erased):
65// acc.declare_enter, acc.declare_exit
66//
67// 6. Runtime Ops (erased):
68// acc.init, acc.shutdown, acc.set, acc.wait, acc.terminator
69//
70// Requirements:
71// -------------
72// For atomic operation conversion, variables must implement the
73// `acc::PointerLikeType` interface to enable generating load/store operations.
74//
75// The pass uses `OpenACCSupport::emitNYI()` to report unsupported cases.
76//
77//===----------------------------------------------------------------------===//
78
80
87#include "mlir/IR/BuiltinOps.h"
89#include "mlir/IR/Operation.h"
92
93namespace mlir {
94namespace acc {
95#define GEN_PASS_DEF_ACCSPECIALIZEFORHOST
96#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
97} // namespace acc
98} // namespace mlir
99
100#define DEBUG_TYPE "acc-specialize-for-host"
101
102using namespace mlir;
103using namespace mlir::acc;
104
105/// Check if an operation is inside an ACC compute construct.
107 while ((op = op->getParentOp()))
108 if (isa<ACC_COMPUTE_CONSTRUCT_OPS>(op))
109 return true;
110 return false;
111}
112
113namespace {
114
115// Lower orphan acc.atomic.update by: load from addr, clone region expr with
116// the loaded value, then store the computed result back to addr.
117// Only matches if NOT inside a compute region.
118class ACCOrphanAtomicUpdateOpConversion
119 : public OpRewritePattern<acc::AtomicUpdateOp> {
120public:
121 ACCOrphanAtomicUpdateOpConversion(MLIRContext *ctx, OpenACCSupport &support)
122 : OpRewritePattern<acc::AtomicUpdateOp>(ctx), accSupport(support) {}
123
124 LogicalResult matchAndRewrite(acc::AtomicUpdateOp atomicUpdateOp,
125 PatternRewriter &rewriter) const override {
126 // Only convert if this op is not inside an ACC compute construct
127 if (isInsideACCComputeConstruct(atomicUpdateOp))
128 return failure();
129
130 Value x = atomicUpdateOp.getX();
131 Type type = x.getType();
132 auto ptrLikeType = dyn_cast<acc::PointerLikeType>(type);
133 if (ptrLikeType) {
134 auto xTyped = cast<TypedValue<acc::PointerLikeType>>(x);
135 rewriter.setInsertionPointAfter(atomicUpdateOp);
136 Value loadOp =
137 ptrLikeType.genLoad(rewriter, atomicUpdateOp.getLoc(), xTyped, {});
138 if (!loadOp) {
139 accSupport.emitNYI(atomicUpdateOp.getLoc(),
140 "failed to generate load for atomic update");
141 return failure();
142 }
143 IRMapping mapping;
144 mapping.map(atomicUpdateOp.getRegion().front().getArgument(0), loadOp);
145 Operation *expr = rewriter.clone(*atomicUpdateOp.getFirstOp(), mapping);
146 if (!ptrLikeType.genStore(rewriter, atomicUpdateOp.getLoc(),
147 expr->getResult(0), xTyped)) {
148 accSupport.emitNYI(atomicUpdateOp.getLoc(),
149 "failed to generate store for atomic update");
150 return failure();
151 }
152 rewriter.eraseOp(atomicUpdateOp);
153 } else {
154 accSupport.emitNYI(atomicUpdateOp.getLoc(),
155 "unsupported type for atomic update");
156 return failure();
157 }
158 return success();
159 }
160
161private:
162 OpenACCSupport &accSupport;
163};
164
165// Lower orphan acc.atomic.read by: load from src, then store into dst.
166// Only matches if NOT inside an ACC compute construct.
167class ACCOrphanAtomicReadOpConversion
168 : public OpRewritePattern<acc::AtomicReadOp> {
169public:
170 ACCOrphanAtomicReadOpConversion(MLIRContext *ctx, OpenACCSupport &support)
171 : OpRewritePattern<acc::AtomicReadOp>(ctx), accSupport(support) {}
172
173 LogicalResult matchAndRewrite(acc::AtomicReadOp readOp,
174 PatternRewriter &rewriter) const override {
175 // Only convert if this op is not inside an ACC compute construct
176 if (isInsideACCComputeConstruct(readOp))
177 return failure();
178
179 Value x = readOp.getX();
180 Value v = readOp.getV();
181 auto xPtrType = dyn_cast<acc::PointerLikeType>(x.getType());
182 auto vPtrType = dyn_cast<acc::PointerLikeType>(v.getType());
183 if (xPtrType && vPtrType) {
184 auto xTyped = cast<TypedValue<acc::PointerLikeType>>(x);
185 auto vTyped = cast<TypedValue<acc::PointerLikeType>>(v);
186 rewriter.setInsertionPointAfter(readOp);
187
188 // Use genCopy which does load + store
189 if (!xPtrType.genCopy(rewriter, readOp.getLoc(), vTyped, xTyped, {})) {
190 accSupport.emitNYI(readOp.getLoc(),
191 "failed to generate copy for atomic read");
192 return failure();
193 }
194 rewriter.eraseOp(readOp);
195 } else {
196 accSupport.emitNYI(readOp.getLoc(), "unsupported type for atomic read");
197 return failure();
198 }
199 return success();
200 }
201
202private:
203 OpenACCSupport &accSupport;
204};
205
206// Lower orphan acc.atomic.write by: store value into addr.
207// Only matches if NOT inside an ACC compute construct.
208class ACCOrphanAtomicWriteOpConversion
209 : public OpRewritePattern<acc::AtomicWriteOp> {
210public:
211 ACCOrphanAtomicWriteOpConversion(MLIRContext *ctx, OpenACCSupport &support)
212 : OpRewritePattern<acc::AtomicWriteOp>(ctx), accSupport(support) {}
213
214 LogicalResult matchAndRewrite(acc::AtomicWriteOp writeOp,
215 PatternRewriter &rewriter) const override {
216 // Only convert if this op is not inside an ACC compute construct
217 if (isInsideACCComputeConstruct(writeOp))
218 return failure();
219
220 Value x = writeOp.getX();
221 Value expr = writeOp.getExpr();
222 auto ptrLikeType = dyn_cast<acc::PointerLikeType>(x.getType());
223 if (ptrLikeType) {
224 auto xTyped = cast<TypedValue<acc::PointerLikeType>>(x);
225 rewriter.setInsertionPointAfter(writeOp);
226 if (!ptrLikeType.genStore(rewriter, writeOp.getLoc(), expr, xTyped)) {
227 accSupport.emitNYI(writeOp.getLoc(),
228 "failed to generate store for atomic write");
229 return failure();
230 }
231 rewriter.eraseOp(writeOp);
232 } else {
233 accSupport.emitNYI(writeOp.getLoc(), "unsupported type for atomic write");
234 return failure();
235 }
236 return success();
237 }
238
239private:
240 OpenACCSupport &accSupport;
241};
242
243// Lower orphan acc.atomic.capture by: unwrap the capture region and erase the
244// wrapper; inner ops are lowered in-order (e.g., read+update becomes load/store
245// to dst then load/compute/store to addr).
246// Only matches if NOT inside an ACC compute construct.
247class ACCOrphanAtomicCaptureOpConversion
248 : public OpRewritePattern<acc::AtomicCaptureOp> {
249 using OpRewritePattern<acc::AtomicCaptureOp>::OpRewritePattern;
250
251 LogicalResult matchAndRewrite(acc::AtomicCaptureOp captureOp,
252 PatternRewriter &rewriter) const override {
253 // Only convert if this op is not inside an ACC compute construct
254 if (isInsideACCComputeConstruct(captureOp))
255 return failure();
256
257 assert(captureOp.getRegion().hasOneBlock() && "expected one block");
258 Block *block = &captureOp.getRegion().front();
259 // Remove the terminator before inlining
260 rewriter.eraseOp(block->getTerminator());
261 rewriter.inlineBlockBefore(block, captureOp);
262 rewriter.eraseOp(captureOp);
263 return success();
264 }
265};
266
267// Convert orphan acc.loop to scf.for or scf.execute_region.
268// Only matches if NOT inside an ACC compute construct.
269class ACCOrphanLoopOpConversion : public OpRewritePattern<acc::LoopOp> {
270 using OpRewritePattern<acc::LoopOp>::OpRewritePattern;
271
272 LogicalResult matchAndRewrite(acc::LoopOp loopOp,
273 PatternRewriter &rewriter) const override {
274 // Only convert if this op is not inside an ACC compute construct
275 if (isInsideACCComputeConstruct(loopOp))
276 return failure();
277
278 if (loopOp.getUnstructured()) {
279 auto executeRegion =
281 if (!executeRegion)
282 return failure();
283 rewriter.replaceOp(loopOp, executeRegion);
284 } else {
285 auto forOp = acc::convertACCLoopToSCFFor(loopOp, rewriter,
286 /*enableCollapse=*/false);
287 if (!forOp)
288 return failure();
289 rewriter.replaceOp(loopOp, forOp);
290 }
291 return success();
292 }
293};
294
295/// Check if an operation is used by a compute construct or loop op
296static bool isUsedByComputeOrLoop(Operation *op) {
297 for (auto *user : op->getUsers())
298 if (isa<acc::ParallelOp, acc::SerialOp, acc::KernelsOp, acc::LoopOp>(user))
299 return true;
300 return false;
301}
302
303/// Orphan data entry ops - only match if NOT connected to compute/loop and
304/// NOT inside a compute region. Used for acc.cache, acc.private,
305/// acc.firstprivate, acc.reduction.
306template <typename OpTy>
307class ACCOrphanDataEntryConversion : public OpRewritePattern<OpTy> {
308 using OpRewritePattern<OpTy>::OpRewritePattern;
309
310 LogicalResult matchAndRewrite(OpTy op,
311 PatternRewriter &rewriter) const override {
312 // Only convert if this op is not used by a compute construct or loop,
313 // and not inside an ACC compute construct.
314 if (isUsedByComputeOrLoop(op) || isInsideACCComputeConstruct(op))
315 return failure();
316
317 if (op->use_empty())
318 rewriter.eraseOp(op);
319 else
320 rewriter.replaceOp(op, op.getVar());
321 return success();
322 }
323};
324
325class ACCSpecializeForHost
326 : public acc::impl::ACCSpecializeForHostBase<ACCSpecializeForHost> {
327public:
328 using ACCSpecializeForHostBase<
329 ACCSpecializeForHost>::ACCSpecializeForHostBase;
330
331 void runOnOperation() override {
332 LLVM_DEBUG(llvm::dbgs() << "Enter ACCSpecializeForHost()\n");
333
334 func::FuncOp funcOp = getOperation();
335 if (!acc::isSpecializedAccRoutine(funcOp)) {
336 // Convert orphan operations to host, or all ACC operations if
337 // host fallback patterns are enabled.
338 auto *context = &getContext();
339 RewritePatternSet patterns(context);
340 OpenACCSupport &accSupport = getAnalysis<OpenACCSupport>();
341 if (enableHostFallback)
343 else
345 GreedyRewriteConfig config;
346 config.setUseTopDownTraversal(true);
347 if (failed(applyPatternsGreedily(funcOp, std::move(patterns), config)))
348 signalPassFailure();
349 }
350
351 LLVM_DEBUG(llvm::dbgs() << "Exit ACCSpecializeForHost()\n");
352 }
353};
354} // namespace
355
356//===----------------------------------------------------------------------===//
357// Pattern population functions
358//===----------------------------------------------------------------------===//
359
361 OpenACCSupport &accSupport,
362 bool enableLoopConversion) {
363 MLIRContext *context = patterns.getContext();
364
365 // For host routines (acc routine marked for host), we only convert orphan
366 // operations that are not allowed outside compute regions. All patterns
367 // here check that the operation is NOT inside a compute region before
368 // converting:
369 // - acc.atomic.* -> load/store operations
370 // - acc.loop -> scf.for or scf.execute_region
371 // - acc.cache -> replaced with var
372 // - acc.private, acc.reduction, acc.firstprivate -> replaced with var
373 // (only if NOT connected to compute constructs or loop)
374 //
375 // We do NOT remove structured/unstructured data constructs, compute
376 // constructs, or their associated data operations - those are valid
377 // in host routines and will be processed by other passes.
378
379 // Loop conversion (orphan only)
380 if (enableLoopConversion)
381 patterns.insert<ACCOrphanLoopOpConversion>(context);
382
383 // Atomic operations - convert to non-atomic load/store (orphan only)
384 patterns.insert<ACCOrphanAtomicUpdateOpConversion>(context, accSupport);
385 patterns.insert<ACCOrphanAtomicReadOpConversion>(context, accSupport);
386 patterns.insert<ACCOrphanAtomicWriteOpConversion>(context, accSupport);
387 patterns.insert<ACCOrphanAtomicCaptureOpConversion>(context);
388
389 // Orphan data entry ops - only convert if NOT connected to compute/loop
390 // and NOT inside a compute region
391 patterns.insert<ACCOrphanDataEntryConversion<acc::CacheOp>,
392 ACCOrphanDataEntryConversion<acc::PrivateOp>,
393 ACCOrphanDataEntryConversion<acc::FirstprivateOp>,
394 ACCOrphanDataEntryConversion<acc::ReductionOp>>(context);
395}
396
398 OpenACCSupport &accSupport,
399 bool enableLoopConversion) {
400 MLIRContext *context = patterns.getContext();
401
402 // For host fallback path (when `if` clause evaluates to false), ALL ACC
403 // operations within the region should be converted to host equivalents.
404 // This includes structured/unstructured data, compute constructs, and
405 // their associated data operations.
406
407 // Loop conversion - OK to use the orphan loop conversion pattern here
408 // because the parent compute constructs will also be converted.
409 if (enableLoopConversion)
410 patterns.insert<ACCOrphanLoopOpConversion>(context);
411
412 // Atomic operations - convert to non-atomic load/store. OK to use the orphan
413 // atomic conversion patterns here because the parent compute constructs will
414 // also be converted.
415 patterns.insert<ACCOrphanAtomicUpdateOpConversion>(context, accSupport);
416 patterns.insert<ACCOrphanAtomicReadOpConversion>(context, accSupport);
417 patterns.insert<ACCOrphanAtomicWriteOpConversion>(context, accSupport);
418 patterns.insert<ACCOrphanAtomicCaptureOpConversion>(context);
419
420 // acc.cache - convert ALL cache ops (including those inside compute regions)
422
423 // Privatization ops - convert ALL (including those attached to compute/loop)
427
428 // Data entry ops - replaced with their var operand
440
441 // Data exit ops - simply erased (no results)
446
447 // Structured data constructs - unwrap their regions
451
452 // Declare ops
455
456 // Unstructured data operations - erase them
460
461 // Runtime operations - erase them
462 patterns.insert<
466
467 // Compute constructs - unwrap their regions
471}
static bool isInsideACCComputeConstruct(Operation *op)
Check if an operation is inside an ACC compute construct.
return success()
b getContext())
Operation * getTerminator()
Get the terminator operation of this block.
Definition Block.cpp:244
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition IRMapping.h:30
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition Builders.cpp:562
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition Builders.h:412
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition Operation.h:407
Operation * getParentOp()
Returns the closest surrounding operation that contains this operation or nullptr if this is a top-le...
Definition Operation.h:234
user_range getUsers()
Returns a range of all users.
Definition Operation.h:873
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
virtual void inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues={})
Inline the operations of block 'source' into block 'dest' before the given position.
Type getType() const
Return the type of this value.
Definition Value.h:105
Pattern to erase acc.declare_enter and its associated acc.declare_exit.
Pattern to simply erase an ACC op (for ops with no results).
Pattern to replace an ACC op with its var operand.
Pattern to unwrap a region from an ACC op and erase the wrapper.
bool isSpecializedAccRoutine(mlir::Operation *op)
Used to check whether this is a specialized accelerator version of acc routine function.
Definition OpenACC.h:195
scf::ExecuteRegionOp convertUnstructuredACCLoopToSCFExecuteRegion(LoopOp loopOp, RewriterBase &rewriter)
Convert an unstructured acc.loop to scf.execute_region.
void populateACCOrphanToHostPatterns(RewritePatternSet &patterns, OpenACCSupport &accSupport, bool enableLoopConversion=true)
Populates patterns for converting orphan ACC operations to host.
void populateACCHostFallbackPatterns(RewritePatternSet &patterns, OpenACCSupport &accSupport, bool enableLoopConversion=true)
Populates all patterns for host fallback path (when if clause evaluates to false).
scf::ForOp convertACCLoopToSCFFor(LoopOp loopOp, RewriterBase &rewriter, bool enableCollapse)
Convert a structured acc.loop to scf.for.
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:573
Include the generated interface declarations.
const FrozenRewritePatternSet GreedyRewriteConfig config
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
const FrozenRewritePatternSet & patterns
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...