MLIR 23.0.0git
LowerGpuOpsToROCDLOps.cpp
Go to the documentation of this file.
1//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a pass to generate ROCDLIR operations for higher-level
10// GPU operations.
11//
12//===----------------------------------------------------------------------===//
13
16#include "mlir/Pass/Pass.h"
18
39#include "mlir/IR/Matchers.h"
42
45
46namespace mlir {
47#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
48#include "mlir/Conversion/Passes.h.inc"
49} // namespace mlir
50
51using namespace mlir;
52
53// Truncate or extend the result depending on the index bitwidth specified
54// by the LLVMTypeConverter options.
55static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
56 Location loc, Value value,
57 const LLVMTypeConverter &converter) {
58 int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
59 int64_t indexBitwidth = converter.getIndexTypeBitwidth();
60 auto indexBitwidthType =
61 IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
62 // TODO: use <=> in C++20.
63 if (indexBitwidth > intWidth) {
64 return LLVM::SExtOp::create(rewriter, loc, indexBitwidthType, value);
65 }
66 if (indexBitwidth < intWidth) {
67 return LLVM::TruncOp::create(rewriter, loc, indexBitwidthType, value);
68 }
69 return value;
70}
71
72/// Returns true if the given `gpu.func` can be safely called using the bare
73/// pointer calling convention.
74static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
75 bool canBeBare = true;
76 for (Type type : func.getArgumentTypes())
77 if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
78 canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
79 return canBeBare;
80}
81
82static Value getLaneId(RewriterBase &rewriter, Location loc) {
83 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
84 Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32);
85 Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32);
86 NamedAttribute noundef = rewriter.getNamedAttr(
87 LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.getUnitAttr());
88 NamedAttribute lowRange = rewriter.getNamedAttr(
89 LLVM::LLVMDialect::getRangeAttrName(),
90 LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
91 APInt(32, 32)));
92 NamedAttribute highRange = rewriter.getNamedAttr(
93 LLVM::LLVMDialect::getRangeAttrName(),
94 LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
95 APInt(32, 64)));
96 Value mbcntLo = ROCDL::MbcntLoOp::create(
97 rewriter, loc, int32Type, minus1, zero, /*arg_attrs=*/{},
98 /*res_attrs=*/
99 rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, lowRange})));
100 Value laneId = ROCDL::MbcntHiOp::create(
101 rewriter, loc, int32Type, minus1, mbcntLo, /*arg_attrs=*/{},
102 rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, highRange})));
103 return laneId;
104}
105
106/// Maximum number of threads per block dimension on AMD GPUs.
107static constexpr int64_t kMaxThreadsPerBlockDim = 1024;
108
109/// Emits a call to an OCKL block/grid size function corresponding to
110/// `indexKind` with argument `dim`, except that if the context around
111/// `contextOp` gives an exact size for that dimension, return that as
112/// an `i64` constant instead.
115 gpu::Dimension dim, Operation *contextOp,
116 std::optional<uint32_t> opUpperBound) {
117 Location loc = contextOp->getLoc();
118 MLIRContext *context = contextOp->getContext();
119
120 auto i32Ty = IntegerType::get(context, 32);
121 auto i64Ty = IntegerType::get(context, 64);
122
123 if (std::optional<uint32_t> knownDim =
124 gpu::getKnownDimensionSizeAround(contextOp, indexKind, dim))
125 return LLVM::ConstantOp::create(rewriter, loc,
126 rewriter.getI64IntegerAttr(*knownDim));
127
128 int32_t dimParam = static_cast<int32_t>(dim);
129
130 StringRef functionName;
131 switch (indexKind) {
132 case gpu::index_lowering::IndexKind::Block:
133 functionName = "__ockl_get_local_size";
134 break;
135 case gpu::index_lowering::IndexKind::Grid:
136 functionName = "__ockl_get_num_groups";
137 break;
138 case gpu::index_lowering::IndexKind::Cluster:
139 case gpu::index_lowering::IndexKind::Other:
140 llvm_unreachable("Not valid index kinds for ockl lookup");
141 }
142
143 // Declare the ockl function: i64 @functionName(i32).
144 auto fnType = LLVM::LLVMFunctionType::get(i64Ty, {i32Ty});
145 Operation *moduleOp = contextOp->getParentWithTrait<OpTrait::SymbolTable>();
146 LLVM::LLVMFuncOp funcOp =
147 getOrDefineFunction(moduleOp, loc, rewriter, functionName, fnType);
148
149 // Create the call.
150 Value dimConst = LLVM::ConstantOp::create(rewriter, loc, i32Ty, dimParam);
151 auto callOp =
152 LLVM::CallOp::create(rewriter, loc, funcOp, ValueRange{dimConst});
153
154 LLVM::ConstantRangeAttr range;
155 if (opUpperBound) {
156 range = LLVM::ConstantRangeAttr::get(
157 context, APInt(64, 1),
158 APInt(64, static_cast<uint64_t>(*opUpperBound) + 1));
159 } else if (indexKind == gpu::index_lowering::IndexKind::Block) {
160 // Set the hardware limit for block ranges as the bounds on block dim calls.
161 range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1),
162 APInt(64, kMaxThreadsPerBlockDim + 1));
163 }
164 if (range) {
165 callOp.setResAttrsAttr(rewriter.getArrayAttr(rewriter.getDictionaryAttr(
166 rewriter.getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range))));
167 }
168 return callOp.getResult();
169}
170
171static constexpr StringLiteral amdgcnDataLayout =
172 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
173 "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
174 "32-v32:"
175 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
176 "64-S32-A5-G1-ni:7:8:9";
177
178namespace {
179
180/// Lowers gpu.block_dim / gpu.grid_dim to direct __ockl_get_local_size /
181/// __ockl_get_num_groups function calls.
182template <typename OpTy>
183struct GPUDimOpToOcklCall final : ConvertOpToLLVMPattern<OpTy> {
184 GPUDimOpToOcklCall(const LLVMTypeConverter &converter,
186 : ConvertOpToLLVMPattern<OpTy>(converter), indexKind(indexKind) {}
187
188 LogicalResult
189 matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
190 ConversionPatternRewriter &rewriter) const override {
191 Location loc = op.getLoc();
192
193 std::optional<uint32_t> opUpperBound;
194 if (auto bound = op.getUpperBound())
195 opUpperBound = static_cast<uint32_t>(bound->getZExtValue());
196
197 Value ocklCall = getKnownOrOcklDim(rewriter, indexKind, op.getDimension(),
198 op, opUpperBound);
199 Value result = truncOrExtToLLVMType(rewriter, loc, ocklCall,
200 *this->getTypeConverter());
201 rewriter.replaceOp(op, result);
202 return success();
203 }
204
205private:
206 const gpu::index_lowering::IndexKind indexKind;
207};
208
209struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
211
212 LogicalResult
213 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
214 ConversionPatternRewriter &rewriter) const override {
215 Location loc = op.getLoc();
216 MLIRContext *context = rewriter.getContext();
217 // convert to:
218 // %mlo = call noundef range(i32 0, 32)
219 // @llvm.amdgcn.mbcnt.lo(-1, 0)
220 // followed by:
221 // %lid = call noundef range(i32 0, 64)
222 // @llvm.amdgcn.mbcnt.hi(-1, %mlo)
223
224 Value laneId = getLaneId(rewriter, loc);
225 // Truncate or extend the result depending on the index bitwidth specified
226 // by the LLVMTypeConverter options.
227 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
228 if (indexBitwidth > 32) {
229 laneId = LLVM::SExtOp::create(
230 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
231 } else if (indexBitwidth < 32) {
232 laneId = LLVM::TruncOp::create(
233 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
234 }
235 rewriter.replaceOp(op, {laneId});
236 return success();
237 }
238};
239
240struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
242
243 GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
244 amdgpu::Chipset chipset)
246 chipset(chipset) {}
247
248 LogicalResult
249 matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
250 ConversionPatternRewriter &rewriter) const override {
251 LLVM::ConstantRangeAttr bounds = nullptr;
252 bool isBeforeGfx10 = chipset.majorVersion < 10;
253 if (auto upperBoundAttr = op.getUpperBoundAttr()) {
254 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
255 /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
256 /*upper=*/op.getUpperBoundAttr().getInt() + 1);
257 }
258 Value wavefrontOp = ROCDL::WavefrontSizeOp::create(
259 rewriter, op.getLoc(), rewriter.getI32Type(), bounds);
260 wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
261 *getTypeConverter());
262 rewriter.replaceOp(op, {wavefrontOp});
263 return success();
264 }
265
266 const amdgpu::Chipset chipset;
267};
268
269struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
271
272 GPUSubgroupIdOpToROCDL(const LLVMTypeConverter &converter,
273 amdgpu::Chipset chipset)
274 : ConvertOpToLLVMPattern<gpu::SubgroupIdOp>(converter), chipset(chipset) {
275 }
276
277 LogicalResult
278 matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
279 ConversionPatternRewriter &rewriter) const override {
280 Location loc = op.getLoc();
281 auto int32Type = rewriter.getI32Type();
282
283 Value subgroupId;
284 if (chipset.majorVersion >= 12) {
285 // For gfx12+, use the hardware wave.id register directly.
286 LLVM::ConstantRangeAttr bounds;
287 if (auto upperBoundAttr = op.getUpperBoundAttr())
288 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
289 /*bitWidth=*/32, /*lower=*/0,
290 /*upper=*/upperBoundAttr.getInt());
291 subgroupId = ROCDL::WaveId::create(rewriter, loc, int32Type, bounds);
292 } else {
293 // For older architectures, compute:
294 // subgroup_id = linearized_thread_id / subgroup_size
295 // where linearized_thread_id = tid.x + dim.x * (tid.y + dim.y * tid.z)
296 auto tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
297 auto tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
298 auto tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
299 auto setBoundFromContext = [&](Operation *tidOp, gpu::Dimension dim) {
300 if (LLVM::ConstantRangeAttr range =
302 op, dim, std::nullopt,
303 gpu::index_lowering::IndexKind::Block,
305 tidOp->setAttr("range", range);
306 };
307 setBoundFromContext(tidX, gpu::Dimension::x);
308 setBoundFromContext(tidY, gpu::Dimension::y);
309 setBoundFromContext(tidZ, gpu::Dimension::z);
310
311 auto flags =
312 LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
313
314 auto getBlockDim = [&](gpu::Dimension dim) {
315 Value dim64 =
316 getKnownOrOcklDim(rewriter, gpu::index_lowering::IndexKind::Block,
317 dim, op, std::nullopt);
318 Value dimTrunc =
319 LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags);
320 return dimTrunc;
321 };
322 Value dimX = getBlockDim(gpu::Dimension::x);
323 Value dimY = getBlockDim(gpu::Dimension::y);
324
325 // linearized = tid.x + dim.x * (tid.y + dim.y * tid.z)
326 // Thread IDs and dimensions are non-negative and small, so use nuw+nsw.
327 Value dimYxTidZ =
328 LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags);
329 Value tidYPlusDimYxTidZ =
330 LLVM::AddOp::create(rewriter, loc, int32Type, tidY, dimYxTidZ, flags);
331 Value dimXxInner = LLVM::MulOp::create(rewriter, loc, int32Type, dimX,
332 tidYPlusDimYxTidZ, flags);
333 Value linearized = LLVM::AddOp::create(rewriter, loc, int32Type, tidX,
334 dimXxInner, flags);
335
336 Value subgroupSize =
337 ROCDL::WavefrontSizeOp::create(rewriter, loc, int32Type);
338 subgroupId = LLVM::UDivOp::create(rewriter, loc, int32Type, linearized,
339 subgroupSize);
340 }
341
342 subgroupId =
343 truncOrExtToLLVMType(rewriter, loc, subgroupId, *getTypeConverter());
344 rewriter.replaceOp(op, subgroupId);
345 return success();
346 }
347
348 const amdgpu::Chipset chipset;
349};
350
351static bool isSupportedReadLaneType(Type type) {
352 // https://llvm.org/docs/AMDGPUUsage.html#llvm-ir-intrinsics
353 if (isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
354 LLVM::LLVMPointerType>(type))
355 return true;
356
357 if (auto intType = dyn_cast<IntegerType>(type))
358 return llvm::is_contained({16, 32, 64},
359 static_cast<int>(intType.getWidth()));
360
361 if (auto vecType = dyn_cast<VectorType>(type)) {
362 Type elementType = vecType.getElementType();
363 if (elementType.isInteger(32))
364 return true;
365
366 if (vecType.getNumElements() == 2 &&
367 (isa<Float16Type, BFloat16Type>(elementType) ||
368 elementType.isInteger(16)))
369 return true;
370 }
371
372 return false;
373}
374
375struct GPUSubgroupBroadcastOpToROCDL
376 : public ConvertOpToLLVMPattern<gpu::SubgroupBroadcastOp> {
378
379 LogicalResult
380 matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
381 ConversionPatternRewriter &rewriter) const override {
382 Value src = adaptor.getSrc();
383 if (isSupportedReadLaneType(src.getType())) {
384 Value result = createReadlaneOp(op, adaptor, rewriter, src);
385 rewriter.replaceOp(op, result);
386 return success();
387 }
388
389 Type i32 = rewriter.getI32Type();
390 Location loc = op.getLoc();
391 SmallVector<Value> decomposed;
392 if (failed(LLVM::decomposeValue(rewriter, loc, src, i32, decomposed,
393 /*permitVariablySizedScalars=*/true)))
394 return rewriter.notifyMatchFailure(op,
395 "Unexpected decomposition failure");
396
397 SmallVector<Value> results;
398 results.reserve(decomposed.size());
399 for (Value v : decomposed)
400 results.emplace_back(createReadlaneOp(op, adaptor, rewriter, v));
401
402 Value result = LLVM::composeValue(rewriter, loc, results, src.getType());
403 rewriter.replaceOp(op, result);
404 return success();
405 }
406
407private:
408 static Value createReadlaneOp(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
409 ConversionPatternRewriter &rewriter,
410 Value src) {
411 if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
412 return ROCDL::ReadlaneOp::create(rewriter, op.getLoc(), src.getType(),
413 src, adaptor.getLane());
414 } else { // first_active_lane
415 return ROCDL::ReadfirstlaneOp::create(rewriter, op.getLoc(),
416 src.getType(), src);
417 }
418 }
419};
420
421struct GPUBallotOpToROCDL : public ConvertOpToLLVMPattern<gpu::BallotOp> {
423
424 LogicalResult
425 matchAndRewrite(gpu::BallotOp op, gpu::BallotOp::Adaptor adaptor,
426 ConversionPatternRewriter &rewriter) const override {
427 auto intType = cast<IntegerType>(op.getType());
428 unsigned width = intType.getWidth();
429
430 // ROCDL ballot natively supports i32 and i64 for wavefront sizes of
431 // 32 and 64 lanes.
432 if (width != 32 && width != 64)
433 return rewriter.notifyMatchFailure(
434 op, "rocdl.ballot only supports i32 and i64 result types");
435
436 rewriter.replaceOpWithNewOp<ROCDL::BallotOp>(op, op.getType(),
437 adaptor.getPredicate());
438 return success();
439 }
440};
441
442struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
444
445 /// Lowers a shuffle to the corresponding ROCDL ops.
446 ///
447 /// Use the `width` argument to see if src lane is participating.
448 /// If not the dstLane would be itself.
449 ///
450 /// Shuffle with DS Bpermute:
451 /// let shflMode = [xor, up, down, idx]
452 /// let width = 32(usually warpsize), step = [1, 2, 4, 8, 16, ... , width].
453 /// 1. curLaneId = using mbcnt.lo + mbcnt.hi
454 /// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
455 /// 3. dstLane = shflMode(curLaneId, step)
456 /// 4. isActiveSrcLane = dstLane < isActiveSrcLane
457 /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
458 /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
459 /// 7. bpermute(dwordAlignedDstLane, shfl_value).
460 ///
461 LogicalResult
462 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
463 ConversionPatternRewriter &rewriter) const override {
464 Location loc = op->getLoc();
465 Value initShflValue = adaptor.getValue();
466
467 Value srcLaneId = getLaneId(rewriter, loc);
468
469 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
470 Value width = adaptor.getWidth();
471 Value zero = LLVM::ConstantOp::create(rewriter, loc, int32Type, 0);
472 Value negwidth = LLVM::SubOp::create(rewriter, loc, int32Type, zero, width);
473 Value add = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId, width);
474 Value widthOrZeroIfOutside =
475 LLVM::AndOp::create(rewriter, loc, int32Type, add, negwidth);
476 Value dstLane;
477
478 switch (op.getMode()) {
479 case gpu::ShuffleMode::UP:
480 dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLaneId,
481 adaptor.getOffset());
482 break;
483 case gpu::ShuffleMode::DOWN:
484 dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId,
485 adaptor.getOffset());
486 break;
487 case gpu::ShuffleMode::XOR:
488 dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLaneId,
489 adaptor.getOffset());
490 break;
491 case gpu::ShuffleMode::IDX:
492 dstLane = adaptor.getOffset();
493 break;
494 }
495 Value isActiveSrcLane = LLVM::ICmpOp::create(
496 rewriter, loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
497 Value selectDstLane = LLVM::SelectOp::create(rewriter, loc, isActiveSrcLane,
498 dstLane, srcLaneId);
499 Value two = LLVM::ConstantOp::create(rewriter, loc, int32Type, 2);
500 Value dwordAlignedDstLane =
501 LLVM::ShlOp::create(rewriter, loc, int32Type, selectDstLane, two);
502
503 SmallVector<Value> decomposed;
504 if (failed(LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type,
505 decomposed)))
506 return rewriter.notifyMatchFailure(op,
507 "failed to decompose value to i32");
508 SmallVector<Value> swizzled;
509 for (Value v : decomposed) {
510 Value res = ROCDL::DsBpermuteOp::create(rewriter, loc, int32Type,
511 dwordAlignedDstLane, v);
512 swizzled.emplace_back(res);
513 }
514 Value shflValue =
515 LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
516 rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
517 return success();
518 }
519};
520
521/// Emit an LLVM fence with MMRA metadata based on the given address spaces.
522/// If `addrSpaces` is nullopt, all memory is fenced (global + LDS).
523static void emitFences(std::optional<ArrayAttr> addrSpaces,
524 ConversionPatternRewriter &rewriter, Location loc,
525 StringRef scope, bool before) {
526 bool fenceGlobal = false;
527 bool fenceLDS = false;
528
529 if (addrSpaces) {
530 for (auto spaceAttr : addrSpaces->getAsRange<gpu::AddressSpaceAttr>()) {
531 switch (spaceAttr.getValue()) {
532 case gpu::AddressSpace::Global:
533 fenceGlobal = true;
534 break;
535 case gpu::AddressSpace::Workgroup:
536 fenceLDS = true;
537 break;
538 case gpu::AddressSpace::Private:
539 case gpu::AddressSpace::Constant:
540 break;
541 }
542 }
543 } else {
544 fenceGlobal = true;
545 fenceLDS = true;
546 }
547
548 if (!fenceGlobal && !fenceLDS)
549 return;
550
551 Attribute mmra;
552 if (fenceLDS && !fenceGlobal)
553 mmra =
554 rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
555 else if (fenceGlobal && !fenceLDS)
556 mmra =
557 rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "global");
558
559 auto ordering =
560 before ? LLVM::AtomicOrdering::release : LLVM::AtomicOrdering::acquire;
561 auto fence = LLVM::FenceOp::create(rewriter, loc, ordering, scope);
562 if (mmra)
563 fence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
564}
565
566static constexpr int32_t kWholeClusterBarrierId = -3;
567static constexpr int32_t kWholeWorkgroupBarrierId = -1;
568struct GPUBarrierOpLowering final : ConvertOpToLLVMPattern<gpu::BarrierOp> {
569 GPUBarrierOpLowering(const LLVMTypeConverter &converter,
570 amdgpu::Chipset chipset)
571 : ConvertOpToLLVMPattern<gpu::BarrierOp>(converter), chipset(chipset) {}
572
573 amdgpu::Chipset chipset;
574
575 LogicalResult
576 matchAndRewrite(gpu::BarrierOp op, gpu::BarrierOp::Adaptor adaptor,
577 ConversionPatternRewriter &rewriter) const override {
578 Location loc = op.getLoc();
579 gpu::BarrierScope scope = op.getScope();
580
581 // Subgroup (wave) scope.
582 if (scope == gpu::BarrierScope::Subgroup) {
583 emitFences(op.getAddressSpaces(), rewriter, loc, "wavefront",
584 /*before=*/true);
585 ROCDL::WaveBarrierOp::create(rewriter, loc);
586 emitFences(op.getAddressSpaces(), rewriter, loc, "wavefront",
587 /*before=*/false);
588 rewriter.eraseOp(op);
589 return success();
590 }
591
592 // Cluster scope: gfx1250+ only, signal/wait with constant -3.
593 if (scope == gpu::BarrierScope::Cluster) {
594 if (chipset < amdgpu::Chipset(12, 5, 0))
595 return op.emitOpError("cluster scope barriers require gfx1250+");
596 emitFences(op.getAddressSpaces(), rewriter, loc, "cluster",
597 /*before=*/true);
598 ROCDL::BarrierSignalOp::create(rewriter, loc, kWholeClusterBarrierId);
599 ROCDL::BarrierWaitOp::create(
600 rewriter, loc, static_cast<int16_t>(kWholeClusterBarrierId));
601 emitFences(op.getAddressSpaces(), rewriter, loc, "cluster",
602 /*before=*/false);
603 rewriter.eraseOp(op);
604 return success();
605 }
606
607 // Workgroup scope (default).
608 assert(scope == gpu::BarrierScope::Workgroup && "unsupported scope");
609
610 // Named barrier path.
611 if (Value namedBarrier = adaptor.getNamedBarrier()) {
612 if (chipset.majorVersion < 12)
613 return op.emitOpError("named barriers require gfx12+");
614
615 emitFences(op.getAddressSpaces(), rewriter, loc, "workgroup",
616 /*before=*/true);
617 // A wave must join the named barrier before it may signal it.
618 ROCDL::BarrierJoinOp::create(rewriter, loc, namedBarrier);
619 // Signal with memberCnt=0 retains the count from s.barrier.init.
620 ROCDL::BarrierSignalVarOp::create(rewriter, loc, namedBarrier,
621 /*memberCnt=*/0);
622 // id=1 selects the named-barrier wait class; the actual barrier waited
623 // on is the last one this wave joined.
624 ROCDL::BarrierWaitOp::create(rewriter, loc, static_cast<int16_t>(1));
625 emitFences(op.getAddressSpaces(), rewriter, loc, "workgroup",
626 /*before=*/false);
627 rewriter.eraseOp(op);
628 return success();
629 }
630
631 // Regular workgroup barrier.
632 emitFences(op.getAddressSpaces(), rewriter, loc, "workgroup",
633 /*before=*/true);
634 if (chipset.majorVersion < 12) {
635 ROCDL::SBarrierOp::create(rewriter, loc);
636 } else {
637 ROCDL::BarrierSignalOp::create(rewriter, loc, kWholeWorkgroupBarrierId);
638 ROCDL::BarrierWaitOp::create(
639 rewriter, loc, static_cast<int16_t>(kWholeWorkgroupBarrierId));
640 }
641 emitFences(op.getAddressSpaces(), rewriter, loc, "workgroup",
642 /*before=*/false);
643 rewriter.eraseOp(op);
644 return success();
645 }
646};
647
648struct GPUInitializeNamedBarrierOpLowering final
649 : ConvertOpToLLVMPattern<gpu::InitializeNamedBarrierOp> {
650 GPUInitializeNamedBarrierOpLowering(const LLVMTypeConverter &converter,
651 amdgpu::Chipset chipset)
653 chipset(chipset) {}
654
655 amdgpu::Chipset chipset;
656
657 LogicalResult
658 matchAndRewrite(gpu::InitializeNamedBarrierOp op,
659 gpu::InitializeNamedBarrierOp::Adaptor adaptor,
660 ConversionPatternRewriter &rewriter) const override {
661 if (chipset.majorVersion < 12)
662 return op.emitOpError("named barriers require gfx12+");
663
664 Location loc = op.getLoc();
665
666 // The count must be a constant for rocdl.s.barrier.init.
667 IntegerAttr countAttr;
668 if (!matchPattern(op.getMemberCount(), m_Constant(&countAttr)))
669 return op.emitOpError(
670 "named barrier member count must be a constant for ROCDL lowering");
671 int32_t count = countAttr.getInt();
672
673 // Place the global in the symbol-table scope enclosing the function-like
674 // op that contains this barrier (typically a module).
675 auto funcOp = op->getParentOfType<FunctionOpInterface>();
676 if (!funcOp)
677 return op.emitOpError("must be inside a function-like op");
678 Operation *symbolTableOp =
680 if (!symbolTableOp)
681 return op.emitOpError(
682 "enclosing function-like op must have a symbol-table parent");
683
684 auto targetTy = LLVM::LLVMTargetExtType::get(
685 rewriter.getContext(), "amdgcn.named.barrier", {}, {0});
686 auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getContext(), 3);
687
688 // Build the global detached so SymbolTable::insert can both place it and
689 // rename it as needed without creating a transient name conflict in IR.
690 OpBuilder detachedBuilder(rewriter.getContext());
691 auto globalOp = LLVM::GlobalOp::create(
692 detachedBuilder, loc, targetTy, /*isConstant=*/false,
693 LLVM::Linkage::Internal, "__named_barrier", /*value=*/Attribute(),
694 /*alignment=*/0, /*addrSpace=*/3);
695 // Initialize with poison.
696 {
697 Region &region = globalOp.getInitializerRegion();
698 Block *block = detachedBuilder.createBlock(&region);
699 detachedBuilder.setInsertionPointToStart(block);
700 auto poison = LLVM::PoisonOp::create(detachedBuilder, loc, targetTy);
701 LLVM::ReturnOp::create(detachedBuilder, loc, poison);
702 }
703 // SymbolTable::insert places the op in the symbol-table body and renames
704 // the symbol to avoid collisions with any existing entries.
705 StringAttr globalName = SymbolTable(symbolTableOp).insert(globalOp);
706
707 // Get address of the global.
708 rewriter.setInsertionPoint(op);
709 auto addrOf = LLVM::AddressOfOp::create(rewriter, loc, ptrTy, globalName);
710
711 // Initialize the barrier.
712 ROCDL::BarrierInitOp::create(rewriter, loc, addrOf, count);
713
714 rewriter.replaceOp(op, addrOf.getResult());
715 return success();
716 }
717};
718
719/// Import the GPU Ops to ROCDL Patterns.
720#include "GPUToROCDL.cpp.inc"
721
722// A pass that replaces all occurrences of GPU device operations with their
723// corresponding ROCDL equivalent.
724//
725// This pass only handles device code and is not meant to be run on GPU host
726// code.
727struct LowerGpuOpsToROCDLOpsPass final
728 : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
729 using Base::Base;
730
731 void getDependentDialects(DialectRegistry &registry) const override {
732 Base::getDependentDialects(registry);
734 }
735
736 void runOnOperation() override {
737 gpu::GPUModuleOp m = getOperation();
738 MLIRContext *ctx = m.getContext();
739
740 auto llvmDataLayout = m->getAttrOfType<StringAttr>(
741 LLVM::LLVMDialect::getDataLayoutAttrName());
742 if (!llvmDataLayout) {
743 llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
744 m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
745 }
746 // Request C wrapper emission.
747 for (auto func : m.getOps<func::FuncOp>()) {
748 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
749 UnitAttr::get(ctx));
750 }
751
752 FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
753 if (failed(maybeChipset)) {
754 emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
755 return signalPassFailure();
756 }
757
758 /// Customize the bitwidth used for the device side index computations.
760 ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
761 options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
762 if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
763 options.overrideIndexBitwidth(indexBitwidth);
764
765 if (useBarePtrCallConv) {
766 options.useBarePtrCallConv = true;
767 WalkResult canUseBarePointers =
768 m.walk([](gpu::GPUFuncOp func) -> WalkResult {
770 return WalkResult::advance();
771 return WalkResult::interrupt();
772 });
773 if (canUseBarePointers.wasInterrupted()) {
774 emitError(UnknownLoc::get(ctx),
775 "bare pointer calling convention requires all memrefs to "
776 "have static shape and use the identity map");
777 return signalPassFailure();
778 }
779 }
780
781 // Apply in-dialect lowering. In-dialect lowering will replace
782 // ops which need to be lowered further, which is not supported by a
783 // single conversion pass.
784 {
785 RewritePatternSet patterns(ctx);
787 populateGpuPromoteShuffleToAMDGPUPatterns(patterns, maybeChipset);
788 (void)applyPatternsGreedily(m, std::move(patterns));
789 }
790
791 LLVMTypeConverter converter(ctx, options);
793
794 RewritePatternSet llvmPatterns(ctx);
796
797 llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
798 allowedDialects.end());
799 for (Dialect *dialect : ctx->getLoadedDialects()) {
800 bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
801 // Empty `allowedDialectsSet` means all dialects are allowed.
802 if (!allowedDialectsSet.empty() && !allowed)
803 continue;
804
805 auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
806 if (!iface) {
807 // Error out if dialect was explicitly specified but doesn't implement
808 // conversion interface.
809 if (allowed) {
810 m.emitError()
811 << "dialect does not implement ConvertToLLVMPatternInterface: "
812 << dialect->getNamespace();
813 return signalPassFailure();
814 }
815 continue;
816 }
817
818 iface->populateConvertToLLVMConversionPatterns(target, converter,
819 llvmPatterns);
820 }
821
822 populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
823 *maybeChipset);
824 populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
825 *maybeChipset);
827 if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
828 signalPassFailure();
829 auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
830 auto reqdWorkGroupSizeAttrHelper =
831 rocdlDialect->getReqdWorkGroupSizeAttrHelper();
832 auto flatWorkGroupSizeAttrHelper =
833 rocdlDialect->getFlatWorkGroupSizeAttrHelper();
834 // Manually rewrite known block size attributes so the LLVMIR translation
835 // infrastructure can pick them up.
836 m.walk([&](LLVM::LLVMFuncOp op) {
837 if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
838 auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
839 // Also set up the rocdl.flat_work_group_size attribute to prevent
840 // conflicting metadata.
841 uint32_t flatSize = 1;
842 for (uint32_t size : blockSizes.asArrayRef()) {
843 flatSize *= size;
844 }
845 StringAttr flatSizeAttr =
846 StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
847 flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
848 }
849 });
850 }
851};
852
853} // namespace
854
856 target.addIllegalOp<func::FuncOp>();
857 target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
858 target.addLegalDialect<ROCDL::ROCDLDialect>();
859 target.addIllegalDialect<gpu::GPUDialect>();
860 target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
861 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
862 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
863 // These ops are legal for f32 type.
864 target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
865 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
866 });
867 // TODO: Remove once we support replacing non-root ops.
868 target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
869}
870
872 const LLVMTypeConverter &converter, RewritePatternSet &patterns,
877 auto *rocdlDialect =
878 converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
879 populateWithGenerated(patterns);
880 patterns.add<
881 gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
882 ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
883 converter, IndexKind::Block, IntrType::Id);
885 gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
886 converter, IndexKind::Grid, IntrType::Id);
887 patterns.add<GPUDimOpToOcklCall<gpu::BlockDimOp>>(converter,
888 IndexKind::Block);
889 patterns.add<GPUDimOpToOcklCall<gpu::GridDimOp>>(converter, IndexKind::Grid);
890 patterns.add<GPUReturnOpLowering>(converter);
891 patterns.add<GPUFuncOpLowering>(
892 converter,
894 /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
895 /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
896 rocdlDialect->getKernelAttrHelper().getName(),
897 rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName(),
898 /*kernelClusterSizeAttributeName=*/{}});
899 if (Runtime::HIP == runtime) {
900 patterns.add<GPUPrintfOpToHIPLowering>(converter);
901 } else if (Runtime::OpenCL == runtime) {
902 // Use address space = 4 to match the OpenCL definition of printf()
903 patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
904 }
905 // TODO: Add alignment for workgroup memory
906 patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
907
908 patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
909 GPUSubgroupBroadcastOpToROCDL, GPUBallotOpToROCDL>(converter);
910 patterns.add<GPUSubgroupIdOpToROCDL, GPUSubgroupSizeOpToROCDL,
911 GPUBarrierOpLowering, GPUInitializeNamedBarrierOpLowering>(
912 converter, chipset);
913
914 populateMathToROCDLConversionPatterns(converter, patterns, chipset);
915}
return success()
b getContext())
static Value getLaneId(RewriterBase &rewriter, Location loc)
static constexpr int64_t kMaxThreadsPerBlockDim
Maximum number of threads per block dimension on AMD GPUs.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value getKnownOrOcklDim(RewriterBase &rewriter, gpu::index_lowering::IndexKind indexKind, gpu::Dimension dim, Operation *contextOp, std::optional< uint32_t > opUpperBound)
Emits a call to an OCKL block/grid size function corresponding to indexKind with argument dim,...
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
#define add(a, b)
Attributes are known-constant values of operations.
Definition Attributes.h:25
Block represents an ordered list of Operations.
Definition Block.h:33
UnitAttr getUnitAttr()
Definition Builders.cpp:102
IntegerAttr getI64IntegerAttr(int64_t value)
Definition Builders.cpp:116
ArrayAttr getArrayAttr(ArrayRef< Attribute > value)
Definition Builders.cpp:271
MLIRContext * getContext() const
Definition Builders.h:56
DictionaryAttr getDictionaryAttr(ArrayRef< NamedAttribute > value)
Definition Builders.cpp:108
NamedAttribute getNamedAttr(StringRef name, Attribute val)
Definition Builders.cpp:98
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
Definition Pattern.h:227
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
Definition Pattern.h:233
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Definition Dialect.h:38
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
NamedAttribute represents a combination of a name and an Attribute value.
Definition Attributes.h:164
This class helps build Operations.
Definition Builders.h:209
Block * createBlock(Region *parent, Region::iterator insertPt={}, TypeRange argTypes={}, ArrayRef< Location > locs={})
Add new block with 'argTypes' arguments and set the insertion point to the end of it.
Definition Builders.cpp:435
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition Builders.h:433
A trait used to provide symbol table functionalities to a region operation.
Operation is the basic unit of execution within MLIR.
Definition Operation.h:87
Operation * getParentWithTrait()
Returns the closest surrounding parent operation with trait Trait.
Definition Operation.h:273
Location getLoc()
The source location the operation was defined or derived from.
Definition Operation.h:240
void setAttr(StringAttr name, Attribute value)
If an attribute exists with the specified name, change it to the new value.
Definition Operation.h:607
MLIRContext * getContext()
Return the context this operation is associated with.
Definition Operation.h:233
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition Region.h:26
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
This class allows for representing and managing the symbol table used by operations with the 'SymbolT...
Definition SymbolTable.h:24
StringAttr insert(Operation *symbol, Block::iterator insertPt={})
Insert a new symbol into the table, and rename it as necessary to avoid collisions.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
bool isInteger() const
Return true if this is an integer type (with the specified width).
Definition Types.cpp:58
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:389
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
A utility result that is used to signal how to proceed with an ongoing walk:
Definition WalkResult.h:29
static WalkResult advance()
Definition WalkResult.h:47
bool wasInterrupted() const
Returns true if the walk was interrupted.
Definition WalkResult.h:51
static WalkResult interrupt()
Definition WalkResult.h:46
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
Definition ArithOps.cpp:283
LogicalResult decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType, SmallVectorImpl< Value > &result, bool permitVariablySizedScalars=false)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Definition Pattern.cpp:495
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
Definition Pattern.cpp:594
void populateCommonGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
Runtime
Potential runtimes for AMD GPU kernels.
Definition Runtimes.h:15
LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim, std::optional< uint32_t > opUpperBound, IndexKind indexKind, IntrType intrType, unsigned bitWidth)
Returns a ConstantRangeAttr for a GPU index op, or nullptr if no bounds are found.
std::optional< uint32_t > getKnownDimensionSizeAround(Operation *op, DimensionKind kind, Dimension dim)
Retrieve the constant bounds for a given dimension and dimension kind from the context surrounding op...
Include the generated interface declarations.
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
Definition Matchers.h:490
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, std::optional< amdgpu::Chipset > chipset)
Populate the given list with patterns that convert from Math to ROCDL calls.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
Definition Passes.h:91
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
LLVM::LLVMFuncOp getOrDefineFunction(Operation *moduleOp, Location loc, OpBuilder &b, StringRef name, LLVM::LLVMFunctionType type)
Note that these functions don't take a SymbolTable because GPU module lowerings can have name collisi...
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces and types,...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
detail::constant_op_matcher m_Constant()
Matches a constant foldable operation.
Definition Matchers.h:369
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
Definition Chipset.h:22
unsigned majorVersion
Definition Chipset.h:23
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.
Definition Chipset.cpp:14