MLIR 23.0.0git
GPUTransformOps.cpp
Go to the documentation of this file.
1//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10
32#include "mlir/IR/AffineExpr.h"
33#include "mlir/IR/Builders.h"
35#include "mlir/IR/IRMapping.h"
36#include "mlir/IR/MLIRContext.h"
38#include "mlir/IR/Visitors.h"
39#include "mlir/Support/LLVM.h"
41#include "llvm/ADT/STLExtras.h"
42#include "llvm/ADT/SmallVector.h"
43#include "llvm/ADT/TypeSwitch.h"
44#include "llvm/Support/DebugLog.h"
45#include "llvm/Support/ErrorHandling.h"
46#include "llvm/Support/InterleavedRange.h"
47#include "llvm/Support/LogicalResult.h"
48#include <optional>
49#include <type_traits>
50
51using namespace mlir;
52using namespace mlir::gpu;
53using namespace mlir::transform;
54using namespace mlir::transform::gpu;
55
56#define DEBUG_TYPE "gpu-transforms"
57
58//===----------------------------------------------------------------------===//
59// Apply...ConversionPatternsOp
60//===----------------------------------------------------------------------===//
61
62void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
63 TypeConverter &typeConverter, RewritePatternSet &patterns) {
64 auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
65 configureGpuToNVVMTypeConverter(llvmTypeConverter);
66 // Set higher benefit, so patterns will run before generic LLVM lowering.
67 populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns,
68 getBenefit());
69}
70
71LogicalResult
72transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
73 transform::TypeConverterBuilderOpInterface builder) {
74 if (builder.getTypeConverterType() != "LLVMTypeConverter")
75 return emitOpError("expected LLVMTypeConverter");
76 return success();
77}
78
79void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
80 TypeConverter &typeConverter, RewritePatternSet &patterns) {
81 auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
82 populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
83}
84
85LogicalResult
86transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
87 transform::TypeConverterBuilderOpInterface builder) {
88 if (builder.getTypeConverterType() != "LLVMTypeConverter")
89 return emitOpError("expected LLVMTypeConverter");
90 return success();
91}
92
93void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
94 populatePatterns(TypeConverter &typeConverter,
95 RewritePatternSet &patterns) {
96 auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
97 populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
98}
99
100LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
101 verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
102 if (builder.getTypeConverterType() != "LLVMTypeConverter")
103 return emitOpError("expected LLVMTypeConverter");
104 return success();
105}
106
107void transform::ApplyGPUToROCDLConversionPatternsOp::populatePatterns(
108 TypeConverter &typeConverter, RewritePatternSet &patterns) {
109 auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
111 FailureOr<amdgpu::Chipset> maybeChipset =
112 amdgpu::Chipset::parse(getChipset());
113 assert(llvm::succeeded(maybeChipset) && "expected valid chipset");
115 llvmTypeConverter, patterns, mlir::gpu::amd::Runtime::HIP, *maybeChipset);
116}
117
118LogicalResult
119transform::ApplyGPUToROCDLConversionPatternsOp::verifyTypeConverter(
120 transform::TypeConverterBuilderOpInterface builder) {
121 FailureOr<amdgpu::Chipset> maybeChipset =
122 amdgpu::Chipset::parse(getChipset());
123 if (failed(maybeChipset)) {
124 return emitOpError("Invalid chipset name: " + getChipset());
125 }
126 if (builder.getTypeConverterType() != "LLVMTypeConverter")
127 return emitOpError("expected LLVMTypeConverter");
128 return success();
129}
130
131//===----------------------------------------------------------------------===//
132// Apply...PatternsOp
133//===----------------------------------------------------------------------===//s
134
135void ApplyGPURewritePatternsOp::populatePatterns(RewritePatternSet &patterns) {
137}
138
139void transform::ApplyGPUPromoteShuffleToAMDGPUPatternsOp::populatePatterns(
140 RewritePatternSet &patterns) {
141 std::optional<StringRef> chipsetName = getChipset();
142 std::optional<amdgpu::Chipset> maybeChipset;
143 if (chipsetName) {
144 FailureOr<amdgpu::Chipset> parsedChipset =
145 amdgpu::Chipset::parse(*chipsetName);
146 assert(llvm::succeeded(parsedChipset) && "expected valid chipset");
147 maybeChipset = parsedChipset;
148 }
149
150 populateGpuPromoteShuffleToAMDGPUPatterns(patterns, maybeChipset);
151}
152
153//===----------------------------------------------------------------------===//
154// ApplyUnrollVectorsSubgroupMmaOp
155//===----------------------------------------------------------------------===//
156
157/// Pick an unrolling order that will allow tensorcore operation to reuse LHS
158/// register.
159static std::optional<SmallVector<int64_t>>
160gpuMmaUnrollOrder(vector::ContractionOp contract) {
162 // First make reduction the outer dimensions.
163 for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
164 if (vector::isReductionIterator(iter)) {
165 order.push_back(index);
166 }
167 }
168
169 llvm::SmallDenseSet<int64_t> dims;
170 for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
171 dims.insert(cast<AffineDimExpr>(expr).getPosition());
172 }
173 // Then parallel dimensions that are part of Lhs as we want to re-use Lhs.
174 for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
175 if (vector::isParallelIterator(iter) && dims.count(index)) {
176 order.push_back(index);
177 }
178 }
179 // Then the remaining parallel loops.
180 for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
181 if (vector::isParallelIterator(iter) && !dims.count(index)) {
182 order.push_back(index);
183 }
184 }
185 return order;
186}
187
188/// Returns the target vector size for the target operation based on the native
189/// vector size specified with `m`, `n`, and `k`.
190static std::optional<SmallVector<int64_t>>
192 if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
193 int64_t contractRank = contract.getIteratorTypes().size();
194 if (contractRank < 3)
195 return std::nullopt;
196 SmallVector<int64_t> nativeSize(contractRank - 3, 1);
197 nativeSize.append({m, n, k});
198 return nativeSize;
199 }
200 if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
201 int64_t writeRank = writeOp.getVectorType().getRank();
202 if (writeRank < 2)
203 return std::nullopt;
204 SmallVector<int64_t> nativeSize(writeRank - 2, 1);
205 nativeSize.append({m, n});
206 return nativeSize;
207 }
208 if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
209 // Transfer read ops may need different shapes based on how they are being
210 // used. For simplicity just match the shape used by the extract strided op.
211 VectorType sliceType;
212 for (Operation *users : op->getUsers()) {
213 auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
214 if (!extract)
215 return std::nullopt;
216 auto vecType = cast<VectorType>(extract.getResult().getType());
217 if (sliceType && sliceType != vecType)
218 return std::nullopt;
219 sliceType = vecType;
220 }
221 return llvm::to_vector(sliceType.getShape());
222 }
224 if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
225 // TODO: The condition for unrolling elementwise should be restricted
226 // only to operations that need unrolling (connected to the contract).
227 if (vecType.getRank() < 2)
228 return std::nullopt;
229
230 // First check whether there is a slice to infer the shape from. This is
231 // required for cases where the accumulator type differs from the input
232 // types, in which case we will see an `arith.ext_` between the contract
233 // and transfer_read which needs to be unrolled.
234 VectorType sliceType;
235 for (Operation *users : op->getUsers()) {
236 auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
237 if (!extract)
238 return std::nullopt;
239 auto vecType = cast<VectorType>(extract.getResult().getType());
240 if (sliceType && sliceType != vecType)
241 return std::nullopt;
242 sliceType = vecType;
243 }
244 if (sliceType)
245 return llvm::to_vector(sliceType.getShape());
246
247 // Else unroll for trailing elementwise.
248 SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
249 // Map elementwise ops to the output shape.
250 nativeSize.append({m, n});
251 return nativeSize;
252 }
253 }
254 return std::nullopt;
255}
256
257void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
258 RewritePatternSet &patterns) {
259 auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
260 auto contract = dyn_cast<vector::ContractionOp>(op);
261 if (!contract)
262 return std::nullopt;
264 };
265
266 int64_t m = getM();
267 int64_t n = getN();
268 int64_t k = getK();
269 auto nativeShapeFn =
270 [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> {
271 return getSubgroupMmaNativeVectorSize(op, m, n, k);
272 };
273 vector::populateVectorUnrollPatterns(
275 .setNativeShapeFn(nativeShapeFn)
276 .setUnrollTraversalOrderFn(unrollOrder));
277}
278
279//===----------------------------------------------------------------------===//
280// EliminateBarriersOp
281//===----------------------------------------------------------------------===//
282
283void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
285}
286
287//===----------------------------------------------------------------------===//
288// Block and thread mapping utilities.
289//===----------------------------------------------------------------------===//
290
291namespace {
292/// Local types used for mapping verification.
293struct MappingKind {};
294struct BlockMappingKind : MappingKind {};
295struct ThreadMappingKind : MappingKind {};
296} // namespace
297
299definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
300 Operation *target, const Twine &message) {
301 if (transformOp.has_value())
302 return transformOp->emitDefiniteFailure() << message;
303 return emitDefiniteFailure(target, message);
304}
305
306/// Check if given mapping attributes are one of the desired attributes
307template <typename MappingKindType>
309checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
310 scf::ForallOp forallOp) {
311 if (!forallOp.getMapping().has_value()) {
312 return definiteFailureHelper(transformOp, forallOp,
313 "scf.forall op requires a mapping attribute");
314 }
315
316 bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
317 llvm::IsaPred<GPUBlockMappingAttr>);
318 bool hasWarpgroupMapping = llvm::any_of(
319 forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
320 bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
321 llvm::IsaPred<GPUWarpMappingAttr>);
322 bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
323 llvm::IsaPred<GPUThreadMappingAttr>);
324 bool hasLaneMapping = llvm::any_of(forallOp.getMapping().value(),
325 llvm::IsaPred<GPULaneMappingAttr>);
326 int64_t countMappingTypes = 0;
327 countMappingTypes += hasBlockMapping ? 1 : 0;
328 countMappingTypes += hasWarpgroupMapping ? 1 : 0;
329 countMappingTypes += hasWarpMapping ? 1 : 0;
330 countMappingTypes += hasThreadMapping ? 1 : 0;
331 countMappingTypes += hasLaneMapping ? 1 : 0;
332 if (countMappingTypes > 1) {
334 transformOp, forallOp,
335 "cannot mix different mapping types, use nesting");
336 }
337 if (std::is_same<MappingKindType, BlockMappingKind>::value &&
338 !hasBlockMapping) {
340 transformOp, forallOp,
341 "scf.forall op requires a mapping attribute of kind 'block'");
342 }
343 if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
344 !hasLaneMapping && !hasThreadMapping && !hasWarpMapping &&
345 !hasWarpgroupMapping) {
346 return definiteFailureHelper(transformOp, forallOp,
347 "scf.forall op requires a mapping attribute "
348 "of kind 'thread' or 'warp'");
349 }
350
352 for (Attribute map : forallOp.getMapping()->getValue()) {
353 if (seen.contains(map)) {
355 transformOp, forallOp,
356 "duplicate attribute, cannot map different loops "
357 "to the same mapping id");
358 }
359 seen.insert(map);
360 }
361
362 auto isLinear = [](DeviceMappingAttrInterface attr) {
363 return attr.isLinearMapping();
364 };
365 if (llvm::any_of(forallOp.getDeviceMappingAttrs(), isLinear) &&
366 !llvm::all_of(forallOp.getDeviceMappingAttrs(), isLinear)) {
368 transformOp, forallOp,
369 "cannot mix linear and non-linear mapping modes");
370 }
371
372 FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
373 forallOp.getDeviceMaskingAttr();
374 if (succeeded(maybeMaskingAttr) && *maybeMaskingAttr &&
375 !forallOp.usesLinearMapping()) {
377 transformOp, forallOp,
378 "device masking is only available in linear mapping mode");
379 }
380
382}
383
384template <typename MappingKindType>
386verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
387 scf::ForallOp forallOp) {
388 // Check the types of the mapping attributes match.
390 checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
391 if (!typeRes.succeeded())
392 return typeRes;
393
394 // Perform other non-types verifications.
395 if (!forallOp.isNormalized())
396 return definiteFailureHelper(transformOp, forallOp,
397 "unsupported non-normalized loops");
398 if (forallOp.getNumResults() > 0)
399 return definiteFailureHelper(transformOp, forallOp,
400 "only bufferized scf.forall can be mapped");
401 bool useLinearMapping = forallOp.usesLinearMapping();
402 // TODO: This would be more natural with support for Optional<EnumParameter>
403 // in GPUDeviceMappingAttr.
404 int64_t maxNumMappingsSupported =
405 useLinearMapping ? (getMaxEnumValForMappingId() -
406 static_cast<uint64_t>(MappingId::DimZ))
407 : 3;
408 if (forallOp.getRank() > maxNumMappingsSupported) {
409 return definiteFailureHelper(transformOp, forallOp,
410 "scf.forall with rank > ")
411 << maxNumMappingsSupported
412 << " does not lower for the specified mapping attribute type";
413 }
414 auto numParallelIterations =
415 getConstantIntValues(forallOp.getMixedUpperBound());
416 if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
418 transformOp, forallOp,
419 "requires statically sized, normalized forall op");
420 }
422}
423
424/// Struct to return the result of the rewrite of a forall operation.
429
430/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
431template <typename OpTy, typename OperationOrBlock>
432static void
434 OperationOrBlock *parent, Value replacement,
435 ArrayRef<int64_t> availableMappingSizes) {
436 parent->walk([&](OpTy idOp) {
437 if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
438 rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
439 });
440}
441
443 RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
444 scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
445 ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) {
446 LDBG() << "--start rewriteOneForallCommonImpl";
447
448 // Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
449 auto numParallelIterations =
450 getConstantIntValues(forallOp.getMixedUpperBound());
451 assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
452 "requires statically sized, normalized forall op");
453 SmallVector<int64_t> tmpMappingSizes = numParallelIterations.value();
454 SmallVector<DeviceMappingAttrInterface> forallMappingAttrsVec =
455 forallOp.getDeviceMappingAttrs();
456 SetVector<Attribute> forallMappingAttrs;
457 forallMappingAttrs.insert_range(forallMappingAttrsVec);
458 auto comparator = [](Attribute a, Attribute b) -> bool {
459 return cast<DeviceMappingAttrInterface>(a).getMappingId() <
460 cast<DeviceMappingAttrInterface>(b).getMappingId();
461 };
462
463 // Step 1.b. In the linear case, compute the max mapping to avoid needlessly
464 // mapping all dimensions. In the 3-D mapping case we need to map all
465 // dimensions.
466 DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
467 *llvm::max_element(forallMappingAttrs, comparator));
468 DeviceMappingAttrInterface maxLinearMapping;
469 if (maxMapping.isLinearMapping())
470 maxLinearMapping = maxMapping;
471 for (auto attr : gpuIdBuilder.mappingAttributes) {
472 // If attr overflows, just skip.
473 if (maxLinearMapping && comparator(maxLinearMapping, attr))
474 continue;
475 // Try to insert. If element was already present, just continue.
476 if (!forallMappingAttrs.insert(attr))
477 continue;
478 // Otherwise, we have a new insertion without a size -> use size 1.
479 tmpMappingSizes.push_back(1);
480 }
481 LDBG() << "----tmpMappingSizes extracted from scf.forall op: "
482 << llvm::interleaved(tmpMappingSizes);
483
484 // Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
485 SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey(
486 forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
487 LDBG() << "----forallMappingSizes: " << llvm::interleaved(forallMappingSizes);
488 LDBG() << "----forallMappingAttrs: " << llvm::interleaved(forallMappingAttrs);
489
490 // Step 3. Generate the mappingIdOps using the provided generator.
491 Location loc = forallOp.getLoc();
492 OpBuilder::InsertionGuard guard(rewriter);
493 rewriter.setInsertionPoint(forallOp);
494 SmallVector<int64_t> originalBasis(availableMappingSizes);
495 bool originalBasisWasProvided = !originalBasis.empty();
496 if (!originalBasisWasProvided) {
497 LDBG() << "----originalBasis was not provided, deriving it and there will "
498 "be no "
499 "predication";
500 originalBasis = forallMappingSizes;
501 while (originalBasis.size() < 3)
502 originalBasis.push_back(1);
503 } else {
504 LDBG() << "----originalBasis was provided, using it, there will be "
505 "predication";
506 }
507 LDBG() << "------originalBasis: " << llvm::interleaved(originalBasis);
508
509 IdBuilderResult builderResult =
510 gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
511 if (!builderResult.errorMsg.empty())
512 return definiteFailureHelper(transformOp, forallOp, builderResult.errorMsg);
513
514 LDBG() << builderResult;
515
516 // Step 4. Map the induction variables to the mappingIdOps, this may involve
517 // a permutation.
518 SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
519 IRMapping bvm;
520 for (auto [iv, dim] : llvm::zip_equal(
521 forallOp.getInductionVars(),
522 forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
523 auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
524 Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
525 LDBG() << "----map: " << iv << " to " << peIdOp;
526 bvm.map(iv, peIdOp);
527 }
528
529 // Step 5. If the originalBasis is already known, create conditionals to
530 // predicate the region. Otherwise, the current forall determines the
531 // originalBasis and no predication occurs.
532 Value predicate;
533 if (originalBasisWasProvided) {
534 for (Value tmpPredicate : builderResult.predicateOps) {
535 predicate = predicate ? arith::AndIOp::create(rewriter, loc, predicate,
536 tmpPredicate)
537 : tmpPredicate;
538 }
539 }
540
541 // Step 6. Move the body of forallOp.
542 // Erase the terminator first, it will not be used.
543 rewriter.eraseOp(forallOp.getTerminator());
544 Block *targetBlock;
545 Block::iterator insertionPoint;
546 if (predicate) {
547 // Step 6.a. If predicated, move at the beginning.
548 auto ifOp = scf::IfOp::create(rewriter, loc, predicate,
549 /*withElseRegion=*/false);
550 targetBlock = ifOp.thenBlock();
551 insertionPoint = ifOp.thenBlock()->begin();
552 } else {
553 // Step 6.b. Otherwise, move inline just at the rewriter insertion
554 // point.
555 targetBlock = forallOp->getBlock();
556 insertionPoint = rewriter.getInsertionPoint();
557 }
558 Block &sourceBlock = forallOp.getRegion().front();
559 targetBlock->getOperations().splice(insertionPoint,
560 sourceBlock.getOperations());
561
562 // Step 7. RAUW indices.
563 for (Value loopIndex : forallOp.getInductionVars()) {
564 Value threadIdx = bvm.lookup(loopIndex);
565 rewriter.replaceAllUsesWith(loopIndex, threadIdx);
566 }
567
568 // Step 8. Erase old op.
569 rewriter.eraseOp(forallOp);
570
571 LDBG() << "----result forallMappingSizes: "
572 << llvm::interleaved(forallMappingSizes);
573 LDBG() << "----result mappingIdOps: " << llvm::interleaved(mappingIdOps);
574
575 result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
577}
578
579//===----------------------------------------------------------------------===//
580// MapForallToBlocks
581//===----------------------------------------------------------------------===//
582
584 RewriterBase &rewriter, TransformOpInterface transformOp,
585 scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
586 const GpuIdBuilder &gpuIdBuilder) {
587 LDBG() << "Start mapForallToBlocksImpl";
588
589 {
590 // GPU-specific verifications. There is no better place to anchor
591 // those right now: the ForallOp is target-independent and the transform
592 // op does not apply to individual ForallOp.
594 verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
595 if (!diag.succeeded())
596 return diag;
597 }
598
599 Location loc = forallOp.getLoc();
600 Block *parentBlock = forallOp->getBlock();
601 Value zero;
602 {
603 // Create an early zero index value for replacements and immediately reset
604 // the insertion point.
605 OpBuilder::InsertionGuard guard(rewriter);
606 rewriter.setInsertionPointToStart(parentBlock);
607 zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
608 }
609
610 ForallRewriteResult rewriteResult;
612 rewriter, transformOp, forallOp,
613 /*availableMappingSizes=*/gridDims, rewriteResult, gpuIdBuilder);
614
615 // Return if anything goes wrong, use silenceable failure as a match
616 // failure.
617 if (!diag.succeeded())
618 return diag;
619
620 // If gridDims was not provided already, set it from the return.
621 if (gridDims.empty()) {
622 gridDims = rewriteResult.mappingSizes;
623 while (gridDims.size() < 3)
624 gridDims.push_back(1);
625 }
626 assert(gridDims.size() == 3 && "Need 3-D gridDims");
627
628 // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
629 // Here, the result of mapping determines the available mapping sizes.
630 replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
631 rewriteResult.mappingSizes);
632
634}
635
638 scf::ForallOp &topLevelForallOp,
639 TransformOpInterface transformOp) {
640 auto walkResult = target->walk([&](scf::ForallOp forallOp) {
641 if (forallOp->getParentOfType<scf::ForallOp>())
642 return WalkResult::advance();
643 if (topLevelForallOp)
644 // TODO: Handle multiple forall if they are independent.
645 return WalkResult::interrupt();
646 topLevelForallOp = forallOp;
647 return WalkResult::advance();
648 });
649
650 if (walkResult.wasInterrupted() || !topLevelForallOp)
651 return transformOp.emitSilenceableError()
652 << "could not find a unique topLevel scf.forall";
654}
655
656DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
659 LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
660 auto transformOp = cast<TransformOpInterface>(getOperation());
661
662 if (!getGenerateGpuLaunch() && !gpuLaunch) {
664 emitSilenceableError()
665 << "Given target is not gpu.launch, set `generate_gpu_launch` "
666 "attribute";
667 diag.attachNote(target->getLoc()) << "when applied to this payload op";
668 return diag;
669 }
670
671 scf::ForallOp topLevelForallOp;
673 target, topLevelForallOp, transformOp);
674 if (!diag.succeeded()) {
675 diag.attachNote(target->getLoc()) << "when applied to this payload op";
676 return diag;
677 }
678 assert(topLevelForallOp && "expect an scf.forall");
679
680 SmallVector<int64_t> gridDims{getGridDims()};
681 if (!getGenerateGpuLaunch() && gridDims.size() != 3)
682 return transformOp.emitDefiniteFailure("transform require size-3 mapping");
683
684 OpBuilder::InsertionGuard guard(rewriter);
685 rewriter.setInsertionPoint(topLevelForallOp);
686
687 // Generate gpu launch here and move the forall inside
688 if (getGenerateGpuLaunch()) {
690 createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
691 if (!diag.succeeded())
692 return diag;
693
694 rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
695 Operation *newForallOp = rewriter.clone(*topLevelForallOp);
696 rewriter.eraseOp(topLevelForallOp);
697 topLevelForallOp = cast<scf::ForallOp>(newForallOp);
698 }
699
700 // The BlockIdBuilder adapts to whatever is thrown at it.
701 bool useLinearMapping = false;
702 if (topLevelForallOp.getMapping())
703 useLinearMapping = topLevelForallOp.usesLinearMapping();
704
705 FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
706 topLevelForallOp.getDeviceMaskingAttr();
707 assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
708 assert((!*maybeMaskingAttr || useLinearMapping) &&
709 "masking requires linear mapping");
710
711 GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping,
712 *maybeMaskingAttr);
713
715 rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
716 if (!diag.succeeded())
717 return diag;
718
719 // Set the GPU launch configuration for the grid dims late, this is
720 // subject to IR inspection.
721 diag = alterGpuLaunch(rewriter, gpuLaunch,
722 cast<TransformOpInterface>(getOperation()), gridDims[0],
723 gridDims[1], gridDims[2]);
724
725 results.push_back(gpuLaunch);
726 return diag;
727}
728
729LogicalResult transform::MapForallToBlocks::verify() {
730 if (!getGridDims().empty() && getGridDims().size() != 3) {
731 return emitOpError() << "transform requires empty or size-3 grid_dims";
732 }
733 return success();
734}
735
736//===----------------------------------------------------------------------===//
737// MapNestedForallToThreads
738//===----------------------------------------------------------------------===//
739
741 std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
742 ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
743 int factor, bool useLinearMapping = false) {
744 if (llvm::any_of(blockOrGridSizes, [](int64_t i) { return i <= 0; })) {
745 return definiteFailureHelper(transformOp, forallOp,
746 "block/grid sizes must be strictly positive");
747 }
748 if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
750 transformOp, forallOp,
751 Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
752 Twine(factor));
753 return diag;
754 }
755 bool hasZeroParallelIteration =
756 llvm::any_of(numParallelIterations, [](int64_t i) { return i == 0; });
757 // `computeProduct` requires strictly positive inputs, so handle the
758 // zero-iteration case explicitly to avoid asserting on valid degenerate
759 // loop bounds.
760 int64_t requiredResourceCount =
761 hasZeroParallelIteration ? 0
762 : computeProduct(numParallelIterations) * factor;
763 int64_t availableResourceCount = computeProduct(blockOrGridSizes);
764 if (requiredResourceCount > availableResourceCount) {
766 transformOp, forallOp,
767 Twine("the number of required parallel resources (blocks or "
768 "threads) ") +
769 Twine(requiredResourceCount) +
770 " overflows the number of available resources " +
771 Twine(availableResourceCount));
772 return diag;
773 }
775}
776
778getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
779 scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes,
780 int64_t warpSize, GpuIdBuilder &gpuIdBuilder) {
781 DeviceMappingAttrInterface mappingAttr =
782 forallOp.getDeviceMappingAttrs().front();
783 bool useLinearMapping = mappingAttr.isLinearMapping();
784
785 // Sanity checks that may result in runtime verification errors.
786 auto numParallelIterations =
787 getConstantIntValues((forallOp.getMixedUpperBound()));
788 if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
790 transformOp, forallOp,
791 "requires statically sized, normalized forall op");
792 }
793 int64_t factor = 1;
794 if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
796 } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
797 factor = warpSize;
798 }
800 checkMappingSpec(transformOp, forallOp, numParallelIterations.value(),
801 blockSizes, factor, useLinearMapping);
802 if (!diag.succeeded())
803 return diag;
804
805 FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
806 forallOp.getDeviceMaskingAttr();
807 assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
808 assert((!*maybeMaskingAttr || useLinearMapping) &&
809 "masking requires linear mapping");
810
811 // Start mapping.
812 MLIRContext *ctx = forallOp.getContext();
813 gpuIdBuilder =
815 .Case([&](GPUWarpgroupMappingAttr) {
816 return GpuWarpgroupIdBuilder(ctx, warpSize, useLinearMapping,
817 *maybeMaskingAttr);
818 })
819 .Case([&](GPUWarpMappingAttr) {
820 return GpuWarpIdBuilder(ctx, warpSize, useLinearMapping,
821 *maybeMaskingAttr);
822 })
823 .Case([&](GPUThreadMappingAttr) {
824 return GpuThreadIdBuilder(ctx, useLinearMapping, *maybeMaskingAttr);
825 })
826 .Case([&](GPULaneMappingAttr) {
827 return GpuLaneIdBuilder(ctx, warpSize, useLinearMapping,
828 *maybeMaskingAttr);
829 })
830 .DefaultUnreachable("unknown mapping attribute");
832}
833
835 RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
836 scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize,
837 bool syncAfterDistribute) {
838
839 {
840 // GPU-specific verifications. There is no better place to anchor
841 // those right now: the ForallOp is target-independent and the transform
842 // op does not apply to individual ForallOp.
844 verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
845 if (!diag.succeeded())
846 return diag;
847 }
848
849 GpuIdBuilder gpuIdBuilder;
850 {
851 // Try to construct the id builder, if it fails, return.
853 transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
854 if (!diag.succeeded())
855 return diag;
856 }
857
858 Location loc = forallOp.getLoc();
859 OpBuilder::InsertionGuard g(rewriter);
860 // Insert after to allow for syncthreads after `forall` is erased.
861 rewriter.setInsertionPointAfter(forallOp);
862 ForallRewriteResult rewriteResult;
864 rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
865 if (!diag.succeeded())
866 return diag;
867 // Add a syncthreads if needed. TODO: warpsync
868 if (syncAfterDistribute)
869 BarrierOp::create(rewriter, loc);
870
872}
873
875 RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
876 Operation *target, ArrayRef<int64_t> blockDims, int64_t warpSize,
877 bool syncAfterDistribute) {
878 LDBG() << "Start mapNestedForallToThreadsImpl";
879 if (blockDims.size() != 3) {
880 return definiteFailureHelper(transformOp, target,
881 "requires size-3 thread mapping");
882 }
883
884 // Create an early zero index value for replacements.
885 Location loc = target->getLoc();
886 Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
888 WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
890 rewriter, transformOp, forallOp, blockDims, warpSize,
891 syncAfterDistribute);
892 if (diag.isDefiniteFailure())
893 return WalkResult::interrupt();
894 if (diag.succeeded())
895 return WalkResult::skip();
896 return WalkResult::advance();
897 });
898 if (walkResult.wasInterrupted())
899 return diag;
900
901 // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
902 // Here, the result of mapping determines the available mapping sizes.
904 blockDims);
905
907}
908
909DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
911 ApplyToEachResultList &results, TransformState &state) {
912 LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
913 auto transformOp = cast<TransformOpInterface>(getOperation());
914
915 // Basic high-level verifications.
916 if (!gpuLaunch)
917 return emitSilenceableError() << "Given target is not a gpu.launch";
918
919 // Mapping to block ids.
920 SmallVector<int64_t> blockDims{getBlockDims()};
922 checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
923 blockDims[0], blockDims[1], blockDims[2]);
924 if (diag.isSilenceableFailure()) {
925 diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
926 return diag;
927 }
928
929 // Set the GPU launch configuration for the block dims early, this is not
930 // subject to IR inspection.
931 diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
932 std::nullopt, std::nullopt, blockDims[0], blockDims[1],
933 blockDims[2]);
934
935 rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
936 diag =
937 mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims,
938 getWarpSize(), getSyncAfterDistribute());
939
940 results.push_back(gpuLaunch.getOperation());
941 return diag;
942}
943
944//===----------------------------------------------------------------------===//
945// Transform op registration
946//===----------------------------------------------------------------------===//
947
948namespace {
949/// Registers new ops and declares PDL as dependent dialect since the
950/// additional ops are using PDL types for operands and results.
951class GPUTransformDialectExtension
953 GPUTransformDialectExtension> {
954public:
955 MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)
956
957 GPUTransformDialectExtension() {
958 declareGeneratedDialect<GPUDialect>();
959 declareGeneratedDialect<amdgpu::AMDGPUDialect>();
960 declareGeneratedDialect<arith::ArithDialect>();
961 declareGeneratedDialect<scf::SCFDialect>();
962 registerTransformOps<
963#define GET_OP_LIST
964#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
965 >();
966 }
967};
968} // namespace
969
970#define GET_OP_CLASSES
971#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
972
974 registry.addExtensions<GPUTransformDialectExtension>();
975}
return success()
p<< " : "<< getMemRefType()<< ", "<< getType();}static LogicalResult verifyVectorMemoryOp(Operation *op, MemRefType memrefType, VectorType vectorType) { if(memrefType.getElementType() !=vectorType.getElementType()) return op-> emitOpError("requires memref and vector types of the same elemental type")
Given a list of lists of parsed operands, populates uniqueOperands with unique operands.
static DiagnosedSilenceableFailure checkMappingAttributeTypes(std::optional< TransformOpInterface > transformOp, scf::ForallOp forallOp)
Check if given mapping attributes are one of the desired attributes.
static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(RewriterBase &rewriter, std::optional< TransformOpInterface > transformOp, scf::ForallOp forallOp, ArrayRef< int64_t > availableMappingSizes, ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder)
static std::optional< SmallVector< int64_t > > getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k)
Returns the target vector size for the target operation based on the native vector size specified wit...
static DiagnosedSilenceableFailure definiteFailureHelper(std::optional< TransformOpInterface > transformOp, Operation *target, const Twine &message)
static DiagnosedSilenceableFailure checkMappingSpec(std::optional< TransformOpInterface > transformOp, scf::ForallOp forallOp, ArrayRef< int64_t > numParallelIterations, ArrayRef< int64_t > blockOrGridSizes, int factor, bool useLinearMapping=false)
static void replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc, OperationOrBlock *parent, Value replacement, ArrayRef< int64_t > availableMappingSizes)
Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
static DiagnosedSilenceableFailure verifyGpuMapping(std::optional< TransformOpInterface > transformOp, scf::ForallOp forallOp)
static DiagnosedSilenceableFailure getThreadIdBuilder(std::optional< TransformOpInterface > transformOp, scf::ForallOp forallOp, ArrayRef< int64_t > blockSizes, int64_t warpSize, GpuIdBuilder &gpuIdBuilder)
static std::optional< SmallVector< int64_t > > gpuMmaUnrollOrder(vector::ContractionOp contract)
Pick an unrolling order that will allow tensorcore operation to reuse LHS register.
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
b getContext())
*if copies could not be generated due to yet unimplemented cases *copyInPlacementStart and copyOutPlacementStart in copyPlacementBlock *specify the insertion points where the incoming copies and outgoing should be the output argument nBegin is set to its * replacement(set to `begin` if no invalidation happens). Since outgoing *copies could have been inserted at `end`
static std::string diag(const llvm::Value &value)
static void contract(RootOrderingGraph &graph, ArrayRef< Value > cycle, const DenseMap< Value, unsigned > &parentDepths, DenseMap< Value, Value > &actualSource, DenseMap< Value, Value > &actualTarget)
Contracts the specified cycle in the given graph in-place.
#define MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CLASS_NAME)
Definition TypeID.h:331
Base type for affine expression.
Definition AffineExpr.h:68
Attributes are known-constant values of operations.
Definition Attributes.h:25
Block represents an ordered list of Operations.
Definition Block.h:33
OpListType::iterator iterator
Definition Block.h:150
OpListType & getOperations()
Definition Block.h:147
Operation & front()
Definition Block.h:163
iterator begin()
Definition Block.h:153
The result of a transform IR operation application.
static DiagnosedSilenceableFailure success()
Constructs a DiagnosedSilenceableFailure in the success state.
bool succeeded() const
Returns true if this is a success.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
void addExtensions()
Add the given extensions to the registry.
This is a utility class for mapping one set of IR entities to another.
Definition IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition IRMapping.h:30
Conversion from types to the LLVM IR dialect.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
RAII guard to reset the insertion point of the builder when destroyed.
Definition Builders.h:350
Block::iterator getInsertionPoint() const
Returns the current insertion point of the builder.
Definition Builders.h:447
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition Builders.cpp:567
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition Builders.h:433
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition Builders.h:400
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition Builders.h:414
Operation is the basic unit of execution within MLIR.
Definition Operation.h:87
result_type_range getResultTypes()
Definition Operation.h:453
user_range getUsers()
Returns a range of all users.
Definition Operation.h:898
unsigned getNumResults()
Return the number of results held by this operation.
Definition Operation.h:429
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
virtual void replaceAllUsesWith(Value from, Value to)
Find uses of from and replace them with to.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
A utility result that is used to signal how to proceed with an ongoing walk:
Definition WalkResult.h:29
static WalkResult skip()
Definition WalkResult.h:48
static WalkResult advance()
Definition WalkResult.h:47
bool wasInterrupted() const
Returns true if the walk was interrupted.
Definition WalkResult.h:51
static WalkResult interrupt()
Definition WalkResult.h:46
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:384
A list of results of applying a transform op with ApplyEachOpTrait to a single payload operation,...
void push_back(Operation *op)
Appends an element to the list.
Base class for extensions of the Transform dialect that supports injecting operations into the Transf...
This is a special rewriter to be used in transform op implementations, providing additional helper fu...
The state maintained across applications of various ops implementing the TransformOpInterface.
bool hasElementwiseMappableTraits(Operation *op)
Together, Elementwise, Scalarizable, Vectorizable, and Tensorizable provide an easy way for scalar op...
void populateCommonGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
void registerTransformDialectExtension(DialectRegistry &registry)
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
uint64_t getN(LevelType lt)
Definition Enums.h:442
uint64_t getM(LevelType lt)
Definition Enums.h:443
DiagnosedSilenceableFailure findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp, TransformOpInterface transformOp)
Find the unique top level scf::ForallOp within a given target op.
DiagnosedSilenceableFailure alterGpuLaunch(RewriterBase &rewriter, mlir::gpu::LaunchOp gpuLaunch, TransformOpInterface transformOp, std::optional< int64_t > gridDimX=std::nullopt, std::optional< int64_t > gridDimY=std::nullopt, std::optional< int64_t > gridDimZ=std::nullopt, std::optional< int64_t > blockDimX=std::nullopt, std::optional< int64_t > blockDimY=std::nullopt, std::optional< int64_t > blockDimZ=std::nullopt)
Alter kernel configuration of the given kernel.
DiagnosedSilenceableFailure createGpuLaunch(RewriterBase &rewriter, Location loc, TransformOpInterface transformOp, mlir::gpu::LaunchOp &launchOp, std::optional< int64_t > gridDimX=std::nullopt, std::optional< int64_t > gridDimY=std::nullopt, std::optional< int64_t > gridDimZ=std::nullopt, std::optional< int64_t > blockDimX=std::nullopt, std::optional< int64_t > blockDimY=std::nullopt, std::optional< int64_t > blockDimZ=std::nullopt)
Create an empty-body gpu::LaunchOp using the provided kernel settings and put a terminator within.
DiagnosedSilenceableFailure mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp, scf::ForallOp forallOp, SmallVectorImpl< int64_t > &gridDims, const GpuIdBuilder &gpuIdBuilder)
Map the top level scf.forall op to GPU blocks.
DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp, std::optional< int64_t > gridDimX, std::optional< int64_t > gridDimY, std::optional< int64_t > gridDimZ, std::optional< int64_t > blockDimX, std::optional< int64_t > blockDimY, std::optional< int64_t > blockDimZ)
Determine if the size of the kernel configuration is supported by the GPU architecture being used.
Definition Utils.cpp:360
DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(RewriterBase &rewriter, std::optional< TransformOpInterface > transformOp, Operation *target, ArrayRef< int64_t > blockDims, int64_t warpSize, bool syncAfterDistribute)
Search scf.forall ops nested under target and map each such op to an explicit GPU implementation alon...
DiagnosedSilenceableFailure mapOneForallToThreadsImpl(RewriterBase &rewriter, std::optional< TransformOpInterface > transformOp, scf::ForallOp forallOp, ArrayRef< int64_t > blockSizes, int64_t warpSize, bool syncAfterDistribute)
Search scf.forall ops nested under target and map each such op to an explicit GPU implementation alon...
bool isReductionIterator(Attribute attr)
Returns true if attr has "reduction" iterator type semantics.
Definition VectorOps.h:156
bool isParallelIterator(Attribute attr)
Returns true if attr has "parallel" iterator type semantics.
Definition VectorOps.h:151
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
llvm::DenseSet< ValueT, ValueInfoT > DenseSet
Definition LLVM.h:122
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
Definition Passes.h:91
int64_t computeProduct(ArrayRef< int64_t > basis)
Self-explicit.
llvm::SetVector< T, Vector, Set, N > SetVector
Definition LLVM.h:125
DiagnosedDefiniteFailure emitDefiniteFailure(Location loc, const Twine &message={})
Emits a definite failure with the given message.
void configureGpuToNVVMTypeConverter(LLVMTypeConverter &converter)
Configure the LLVM type convert to convert types and address spaces from the GPU dialect to NVVM.
void populateGpuSubgroupReduceOpLoweringPattern(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Populate GpuSubgroupReduce pattern to NVVM.
void populateGpuToNVVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Collect a set of patterns to convert from the GPU dialect to NVVM.
llvm::TypeSwitch< T, ResultT > TypeSwitch
Definition LLVM.h:139
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
std::optional< SmallVector< int64_t > > getConstantIntValues(ArrayRef< OpFoldResult > ofrs)
If all ofrs are constant integers or IntegerAttrs, return the integers.
SmallVector< Value > getValuesSortedByKey(ArrayRef< Attribute > keys, ArrayRef< Value > values, llvm::function_ref< bool(Attribute, Attribute)> compare)
Helper to sort values according to matching keys.
void populateGpuEliminateBarriersPatterns(RewritePatternSet &patterns)
Erase barriers that do not enforce conflicting memory side effects.
void populateGpuWMMAToNVVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
Struct to return the result of the rewrite of a forall operation.
SmallVector< Value > mappingIds
SmallVector< int64_t > mappingSizes
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.
Definition Chipset.cpp:14
Builder for gpu::BlockIdOps used to map scf.forall to blocks.
Definition Utils.h:81
Helper struct for configuring the rewrite of mapped scf.forall ops to various gpu id configurations.
Definition Utils.h:60
SmallVector< DeviceMappingAttrInterface > mappingAttributes
The mapping attributes targeted by this generator.
Definition Utils.h:69
GpuIdBuilderFnType idBuilder
The constructor that builds the concrete IR for mapping ids.
Definition Utils.h:72
Builder for warp ids used to map scf.forall to reindexed threads.
Definition Utils.h:120
Builder for warp ids used to map scf.forall to reindexed warps.
Definition Utils.h:107
Builder for warpgroup ids used to map scf.forall to reindexed warpgroups.
Definition Utils.h:92
static constexpr int64_t kNumWarpsPerGroup
In the future this may be configured by the transformation.
Definition Utils.h:98
Helper type for functions that generate ids for the mapping of a scf.forall.
Definition Utils.h:31
std::string errorMsg
Error message, if not empty then building the ids failed.
Definition Utils.h:33
SmallVector< Value > predicateOps
Values used to predicate the forall body when activeMappingSizes is smaller than the available mappin...
Definition Utils.h:38
SmallVector< Value > mappingIdOps
Values used to replace the forall induction variables.
Definition Utils.h:35
Options that control the vector unrolling.